3 Commits

Author SHA1 Message Date
lapwat
be69854b17 add reverse option 2022-02-21 00:32:39 +01:00
lapwat
d8a3cc027f fix: selector depth 2022-02-06 23:35:35 +01:00
lapwat
be45a8f744 update installation instructions 2022-02-05 11:58:02 +01:00
7 changed files with 79 additions and 41 deletions

View File

@@ -1,23 +1,22 @@
# Papeer
Papeer is a powerful an **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
# Table of contents
- [Usage](#usage)
- [Examples](#examples)
* [Grab a single page](#grab-a-single-page)
* [Grab several pages](#grab-several-pages)
+ [`selector` option](#-selector--option)
* [Scrape a web page](#scrape-a-web-page)
* [Scrape a whole website](#scrape-a-whole-website)
+ [`depth` option](#-depth--option)
+ [Display table of contents](#display-table-of-contents)
+ [`selector` option](#-selector--option)
+ [Display the table of contents](#display-the-table-of-contents)
+ [Scrape time](#scrape-time)
- [Installation](#installation)
* [From source](#from-source)
* [From binary](#from-binary)
+ [Linux / MacOS](#linux---macos)
+ [Windows](#windows)
* [MOBI support (optional)](#mobi-support--optional-)
* [MOBI support](#mobi-support)
- [Autocompletion](#autocompletion)
- [Dependencies](#dependencies)
@@ -49,6 +48,7 @@ Flags:
-o, --offset int skip first chapters, use with depth/selector
--output string file name (default: book name)
-q, --quiet hide progress bar
-r, --reverse reverse chapter order
-s, --selector strings table of contents CSS selector
-t, --threads int download concurrency, use with depth/selector (default -1)
--use-link-name use link name for chapter title
@@ -64,9 +64,9 @@ You can activate this mode by using the `depth` or `selector` options.
This option defaults to 0, `papeer` will grab only the main page.
If you specify a value greater than 0, `papeer` will grab only the pages as deep as the value you specify.
If you specify a value greater than 0, `papeer` will grab pages as deep as the value you specify.
> Using `include` option will include all intermediary levels.
> Using the `include` option will include all intermediary levels in the book.
### `selector` option
@@ -76,9 +76,9 @@ If this option is specified, `papeer` will select the links (a HTML tag) present
You can chain this option to grab several levels of pages with different selectors for each level.
## Display the table of contents
### Display the table of contents
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you vizualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options.
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset`, `reverse` and `include`. Type `papeer list --help` for more information about those options.
```sh
papeer list https://12factor.net/ -s 'section.concrete>article>h2>a'
@@ -138,7 +138,7 @@ go get -u github.com/lapwat/papeer
```sh
# use platform=darwin for MacOS
platform=linux
release=0.4.0
release=0.4.2
# download and extract
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
@@ -146,15 +146,14 @@ tar xzvf papeer.tar.gz
rm papeer.tar.gz
# move to user binaries
chmod +x papeer
sudo mv papeer /usr/local/bin
```
### Windows
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.0/papeer-v0.4.0-windows-amd64.exe).
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.2/papeer-v0.4.2-windows-amd64.exe.zip).
## (optional) MOBI support
## MOBI support
Install kindlegen to convert websites, Linux only

View File

@@ -23,6 +23,7 @@ type ScrapeConfig struct {
Quiet bool
Limit int
Offset int
Reverse bool
Delay int
Threads int
Include bool
@@ -31,7 +32,7 @@ type ScrapeConfig struct {
}
func NewScrapeConfig() *ScrapeConfig {
return &ScrapeConfig{0, "", false, -1, 0, -1, -1, true, false, false}
return &ScrapeConfig{0, "", false, -1, 0, false, -1, -1, true, false, false}
}
func NewScrapeConfigs(selectors []string) []*ScrapeConfig {
@@ -167,21 +168,24 @@ func NewChapterFromURL(url, linkName string, configs []*ScrapeConfig, index int,
updateProgressBarName(index, name)
}
subchapters := []chapter{}
var subchapters []chapter
if len(configs) > 1 {
// add subchapters
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, false)
// retrieve links on page
links, _, _, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, false)
if err != nil {
log.Fatal(err)
}
subchapters = make([]chapter, len(links))
// init progress bar
var p progress
if config.Quiet == false {
p = NewProgress(links, name, config.Depth)
}
// init chapters list
subchapters = make([]chapter, len(links))
if config.Delay >= 0 {
// synchronous mode
@@ -277,7 +281,7 @@ func tableOfContent(url string, config *ScrapeConfig, subConfig *ScrapeConfig, q
log.Fatal(err)
}
links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Include)
links, _, home, err := GetLinks(base, config.Selector, config.Limit, config.Offset, config.Reverse, config.Include)
if err != nil {
log.Fatal(err)
}
@@ -370,7 +374,7 @@ func GetPath(elm *goquery.Selection) string {
return join
}
func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool) ([]link, string, chapter, error) {
func GetLinks(url *urllib.URL, selector string, limit, offset int, reverse, include bool) ([]link, string, chapter, error) {
selectorSet := true
if len(selector) == 0 {
selector = "a"
@@ -434,10 +438,18 @@ func GetLinks(url *urllib.URL, selector string, limit, offset int, include bool)
home := NewChapterFromURL(url.String(), "", []*ScrapeConfig{NewScrapeConfig()}, 0, func(index int, name string) {})
// include home page
if include {
l := NewLink(url.String(), home.Name())
links = append([]link{l}, links...)
}
// reverse links
if reverse {
for i, j := 0, len(links)-1; i < j; i, j = i+1, j-1 {
links[i], links[j] = links[j], links[i]
}
}
return links, pathMax, home, nil
}

View File

@@ -182,6 +182,24 @@ func TestSubChaptersLimitOver(t *testing.T) {
}
// TestReverse scrapes https://books.lapw.at/ with Reverse enabled on the
// top-level config and checks the name of the first resulting subchapter.
// NOTE(review): presumably "The Twelve-Factor App" is the last link on that
// page when Reverse is off — confirm against the live page.
func TestReverse(t *testing.T) {
	config0 := NewScrapeConfig()
	config0.Reverse = true
	config1 := NewScrapeConfig()

	c := NewChapterFromURL("https://books.lapw.at/", "", []*ScrapeConfig{config0, config1}, 0, func(index int, name string) {})

	// Fail with a readable message instead of panicking on an out-of-range
	// index when the scrape yields no subchapters.
	subchapters := c.SubChapters()
	if len(subchapters) == 0 {
		t.Fatal("got no subchapters, wanted at least one")
	}

	got := subchapters[0].Name()
	want := "The Twelve-Factor App"
	if got != want {
		t.Errorf("got %v, wanted %v", got, want)
	}
}
func TestNotInclude(t *testing.T) {
config := NewScrapeConfig()

View File

@@ -25,6 +25,7 @@ type GetOptions struct {
depth int
limit int
offset int
reverse bool
delay int
threads int
// includeUrl bool
@@ -49,6 +50,7 @@ func init() {
getCmd.Flags().IntVarP(&getOpts.depth, "depth", "d", 0, "scraping depth")
getCmd.Flags().IntVarP(&getOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
getCmd.Flags().IntVarP(&getOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
getCmd.Flags().BoolVarP(&getOpts.reverse, "reverse", "r", false, "reverse chapter order")
getCmd.Flags().IntVarP(&getOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
getCmd.Flags().IntVarP(&getOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
getCmd.Flags().BoolVarP(&getOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
@@ -96,6 +98,10 @@ var getCmd = &cobra.Command{
return errors.New("cannot use offset option if depth/selector is not specified")
}
if cmd.Flags().Changed("reverse") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use reverse option if depth/selector is not specified")
}
if cmd.Flags().Changed("delay") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use delay option if depth/selector is not specified")
}
@@ -104,6 +110,10 @@ var getCmd = &cobra.Command{
return errors.New("cannot use threads option if depth/selector is not specified")
}
if cmd.Flags().Changed("use-link-name") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use use-link-name option if depth/selector is not specified")
}
if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") {
return errors.New("cannot use delay and threads options at the same time")
}
@@ -114,7 +124,8 @@ var getCmd = &cobra.Command{
url := args[0]
// fill selector array with empty selectors to match depth
for len(getOpts.Selector) < getOpts.depth+2 {
getOpts.Selector = append(getOpts.Selector, "")
for len(getOpts.Selector) < getOpts.depth+1 {
getOpts.Selector = append(getOpts.Selector, "")
}
@@ -126,6 +137,7 @@ var getCmd = &cobra.Command{
config.Quiet = getOpts.quiet
config.Limit = getOpts.limit
config.Offset = getOpts.offset
config.Reverse = getOpts.reverse
config.Delay = getOpts.delay
config.Threads = getOpts.threads
config.ImagesOnly = getOpts.images

View File

@@ -21,6 +21,7 @@ type ListOptions struct {
depth int
limit int
offset int
reverse bool
delay int
threads int
// includeUrl bool
@@ -33,10 +34,12 @@ var listOpts *ListOptions
func init() {
listOpts = &ListOptions{}
// common with get command
listCmd.Flags().StringSliceVarP(&listOpts.Selector, "selector", "s", []string{}, "table of contents CSS selector")
listCmd.Flags().IntVarP(&listOpts.depth, "depth", "d", 0, "scraping depth")
listCmd.Flags().IntVarP(&listOpts.limit, "limit", "l", -1, "limit number of chapters, use with depth/selector")
listCmd.Flags().IntVarP(&listOpts.offset, "offset", "o", 0, "skip first chapters, use with depth/selector")
listCmd.Flags().BoolVarP(&listOpts.reverse, "reverse", "r", false, "reverse chapter order")
listCmd.Flags().IntVarP(&listOpts.delay, "delay", "", -1, "time in milliseconds to wait before downloading next chapter, use with depth/selector")
listCmd.Flags().IntVarP(&listOpts.threads, "threads", "t", -1, "download concurrency, use with depth/selector")
listCmd.Flags().BoolVarP(&listOpts.include, "include", "i", false, "include URL as first chapter, use with depth/selector")
@@ -66,7 +69,7 @@ var listCmd = &cobra.Command{
log.Fatal(err)
}
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.include)
links, path, _, err := book.GetLinks(base, listOpts.Selector[0], listOpts.limit, listOpts.offset, listOpts.reverse, listOpts.include)
if err != nil {
log.Fatal(err)
}

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version",
Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.4.0")
fmt.Println("papeer v0.4.2")
},
}

View File

@@ -1,34 +1,28 @@
#!/usr/bin/env bash
version=$1
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
if [ "$#" -ne 1 ]; then
echo "Illegal number of parameters"
echo "Usage: ./release.sh X.X.X"
exit 1
fi
version=$1
platforms=("linux/amd64" "darwin/amd64" "windows/amd64")
for platform in "${platforms[@]}"
do
platform_split=(${platform//\// })
GOOS=${platform_split[0]}
GOARCH=${platform_split[1]}
output_name='papeer-v'$version'-'$GOOS'-'$GOARCH
if [ $GOOS = "windows" ]; then
output_name+='.exe'
fi
env GOOS=$GOOS GOARCH=$GOARCH go build -o $output_name
if [ $? -ne 0 ]; then
echo 'An error has occurred! Aborting the script execution...'
exit 1
fi
output_name=papeer
if [ $GOOS = "windows" ]; then
zip "$output_name.exe.zip" "$output_name"
env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name.exe"
zip "$output_name-v$version-$GOOS-$GOARCH.exe.zip" "$output_name.exe"
rm "$output_name.exe"
else
tar czvf "$output_name.tar.gz" "$output_name"
env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name"
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
rm "$output_name"
fi
rm "$output_name"
done