2 Commits

Author SHA1 Message Date
lapwat
d8a3cc027f fix: selector depth 2022-02-06 23:35:35 +01:00
lapwat
be45a8f744 update installation instructions 2022-02-05 11:58:02 +01:00
4 changed files with 32 additions and 29 deletions

View File

@@ -1,23 +1,22 @@
# Papeer # Papeer
Papeer is a powerful an **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files. Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
# Table of contents # Table of contents
- [Usage](#usage) - [Usage](#usage)
- [Examples](#examples) * [Scrape a web page](#scrape-a-web-page)
* [Grab a single page](#grab-a-single-page) * [Scrape a whole website](#scrape-a-whole-website)
* [Grab several pages](#grab-several-pages)
+ [`selector` option](#-selector--option)
+ [`depth` option](#-depth--option) + [`depth` option](#-depth--option)
+ [Display table of contents](#display-table-of-contents) + [`selector` option](#-selector--option)
+ [Display the table of contents](#display-the-table-of-contents)
+ [Scrape time](#scrape-time) + [Scrape time](#scrape-time)
- [Installation](#installation) - [Installation](#installation)
* [From source](#from-source) * [From source](#from-source)
* [From binary](#from-binary) * [From binary](#from-binary)
+ [Linux / MacOS](#linux---macos) + [Linux / MacOS](#linux---macos)
+ [Windows](#windows) + [Windows](#windows)
* [MOBI support (optional)](#mobi-support--optional-) * [MOBI support](#mobi-support)
- [Autocompletion](#autocompletion) - [Autocompletion](#autocompletion)
- [Dependencies](#dependencies) - [Dependencies](#dependencies)
@@ -64,9 +63,9 @@ You can activate this mode by using the `depth` or `selector` options.
This option defaults to 0, `papeer` will grab only the main page. This option defaults to 0, `papeer` will grab only the main page.
If you specify a value greater than 0, `papeer` will grab only the pages as deep as the value you specify. If you specify a value greater than 0, `papeer` will grab pages as deep as the value you specify.
> Using `include` option will include all intermediary levels. > Using `include` option will include all intermediary levels into the book.
### `selector` option ### `selector` option
@@ -76,7 +75,7 @@ If this option is specified, `papeer` will select the links (a HTML tag) present
You can chain this option to grab several levels of pages with different selectors for each level. You can chain this option to grab several levels of pages with different selectors for each level.
## Display the table of contents ### Display the table of contents
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options. Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options.
@@ -138,7 +137,7 @@ go get -u github.com/lapwat/papeer
```sh ```sh
# use platform=darwin for MacOS # use platform=darwin for MacOS
platform=linux platform=linux
release=0.4.0 release=0.4.1
# download and extract # download and extract
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
@@ -146,15 +145,14 @@ tar xzvf papeer.tar.gz
rm papeer.tar.gz rm papeer.tar.gz
# move to user binaries # move to user binaries
chmod +x papeer
sudo mv papeer /usr/local/bin sudo mv papeer /usr/local/bin
``` ```
### Windows ### Windows
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.0/papeer-v0.4.0-windows-amd64.exe). Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.1/papeer-v0.4.1-windows-amd64.exe.zip).
## (optional) MOBI support ## MOBI support
Install kindlegen to convert websites, Linux only Install kindlegen to convert websites, Linux only

View File

@@ -104,6 +104,10 @@ var getCmd = &cobra.Command{
return errors.New("cannot use threads option if depth/selector is not specified") return errors.New("cannot use threads option if depth/selector is not specified")
} }
if cmd.Flags().Changed("use-link-name") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
return errors.New("cannot use use-link-name option if depth/selector is not specified")
}
if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") { if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") {
return errors.New("cannot use delay and threads options at the same time") return errors.New("cannot use delay and threads options at the same time")
} }
@@ -114,9 +118,11 @@ var getCmd = &cobra.Command{
url := args[0] url := args[0]
// fill selector array with empty selectors to match depth // fill selector array with empty selectors to match depth
for len(getOpts.Selector) < getOpts.depth+2 { getOpts.Selector = append(getOpts.Selector, "")
for len(getOpts.Selector) < getOpts.depth+1 {
getOpts.Selector = append(getOpts.Selector, "") getOpts.Selector = append(getOpts.Selector, "")
} }
fmt.Println(len(getOpts.Selector))
// generate config for each level // generate config for each level
configs := make([]*book.ScrapeConfig, len(getOpts.Selector)) configs := make([]*book.ScrapeConfig, len(getOpts.Selector))

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version", Use: "version",
Short: "Print the version number of papeer", Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) { Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.4.0") fmt.Println("papeer v0.4.1")
}, },
} }

View File

@@ -14,21 +14,20 @@ do
platform_split=(${platform//\// }) platform_split=(${platform//\// })
GOOS=${platform_split[0]} GOOS=${platform_split[0]}
GOARCH=${platform_split[1]} GOARCH=${platform_split[1]}
output_name='papeer-v'$version'-'$GOOS'-'$GOARCH output_name=papeer
if [ $GOOS = "windows" ]; then
output_name+='.exe'
fi
env GOOS=$GOOS GOARCH=$GOARCH go build -o $output_name
if [ $? -ne 0 ]; then
echo 'An error has occurred! Aborting the script execution...'
exit 1
fi
if [ $GOOS = "windows" ]; then if [ $GOOS = "windows" ]; then
zip "$output_name.exe.zip" "$output_name" env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name.exe"
zip "$output_name-v$version-$GOOS-$GOARCH.exe.zip" "$output_name.exe"
rm "$output_name.exe"
else else
tar czvf "$output_name.tar.gz" "$output_name" env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name"
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
rm "$output_name"
fi fi
rm "$output_name"
# if [ $? -ne 0 ]; then
# echo 'An error has occurred! Aborting the script execution...'
# exit 1
# fi
done done