mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-25 20:00:47 +00:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d8a3cc027f | ||
|
|
be45a8f744 |
26
README.md
26
README.md
@@ -1,23 +1,22 @@
|
|||||||
# Papeer
|
# Papeer
|
||||||
|
|
||||||
Papeer is a powerful an **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
|
Papeer is a powerful **ereader internet vacuum**. It can scrape any website, removing ads and keeping only the relevant content (formatted text and images). You can export the content to Markdown, EPUB or MOBI files.
|
||||||
|
|
||||||
# Table of contents
|
# Table of contents
|
||||||
|
|
||||||
- [Usage](#usage)
|
- [Usage](#usage)
|
||||||
- [Examples](#examples)
|
* [Scrape a web page](#scrape-a-web-page)
|
||||||
* [Grab a single page](#grab-a-single-page)
|
* [Scrape a whole website](#scrape-a-whole-website)
|
||||||
* [Grab several pages](#grab-several-pages)
|
|
||||||
+ [`selector` option](#-selector--option)
|
|
||||||
+ [`depth` option](#-depth--option)
|
+ [`depth` option](#-depth--option)
|
||||||
+ [Display table of contents](#display-table-of-contents)
|
+ [`selector` option](#-selector--option)
|
||||||
|
+ [Display the table of contents](#display-the-table-of-contents)
|
||||||
+ [Scrape time](#scrape-time)
|
+ [Scrape time](#scrape-time)
|
||||||
- [Installation](#installation)
|
- [Installation](#installation)
|
||||||
* [From source](#from-source)
|
* [From source](#from-source)
|
||||||
* [From binary](#from-binary)
|
* [From binary](#from-binary)
|
||||||
+ [Linux / MacOS](#linux---macos)
|
+ [Linux / MacOS](#linux---macos)
|
||||||
+ [Windows](#windows)
|
+ [Windows](#windows)
|
||||||
* [MOBI support (optional)](#mobi-support--optional-)
|
* [MOBI support](#mobi-support)
|
||||||
- [Autocompletion](#autocompletion)
|
- [Autocompletion](#autocompletion)
|
||||||
- [Dependencies](#dependencies)
|
- [Dependencies](#dependencies)
|
||||||
|
|
||||||
@@ -64,9 +63,9 @@ You can activate this mode by using the `depth` or `selector` options.
|
|||||||
|
|
||||||
This option defaults to 0, `papeer` will grab only the main page.
|
This option defaults to 0, `papeer` will grab only the main page.
|
||||||
|
|
||||||
If you specify a value greater than 0, `papeer` will grab only the pages as deep as the value you specify.
|
If you specify a value greater than 0, `papeer` will grab pages as deep as the value you specify.
|
||||||
|
|
||||||
> Using `include` option will include all intermediary levels.
|
> Using `include` option will include all intermediary levels into the book.
|
||||||
|
|
||||||
### `selector` option
|
### `selector` option
|
||||||
|
|
||||||
@@ -76,7 +75,7 @@ If this option is specified, `papeer` will select the links (a HTML tag) present
|
|||||||
|
|
||||||
You can chain this option to grab several levels of pages with different selectors for each level.
|
You can chain this option to grab several levels of pages with different selectors for each level.
|
||||||
|
|
||||||
## Display the table of contents
|
### Display the table of contents
|
||||||
|
|
||||||
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options.
|
Before actually scraping a whole website, it is a good idea to use the `list` command. This command is like a **dry run**, which lets you visualize the content before actually retrieving it. You can use several options to customize the table of contents extraction, such as `selector`, `limit`, `offset` and `include`. Type `papeer list --help` for more information about those options.
|
||||||
|
|
||||||
@@ -138,7 +137,7 @@ go get -u github.com/lapwat/papeer
|
|||||||
```sh
|
```sh
|
||||||
# use platform=darwin for MacOS
|
# use platform=darwin for MacOS
|
||||||
platform=linux
|
platform=linux
|
||||||
release=0.4.0
|
release=0.4.1
|
||||||
|
|
||||||
# download and extract
|
# download and extract
|
||||||
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
curl -L https://github.com/lapwat/papeer/releases/download/v$release/papeer-v$release-$platform-amd64.tar.gz > papeer.tar.gz
|
||||||
@@ -146,15 +145,14 @@ tar xzvf papeer.tar.gz
|
|||||||
rm papeer.tar.gz
|
rm papeer.tar.gz
|
||||||
|
|
||||||
# move to user binaries
|
# move to user binaries
|
||||||
chmod +x papeer
|
|
||||||
sudo mv papeer /usr/local/bin
|
sudo mv papeer /usr/local/bin
|
||||||
```
|
```
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
|
|
||||||
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.0/papeer-v0.4.0-windows-amd64.exe).
|
Download [latest release](https://github.com/lapwat/papeer/releases/download/v0.4.1/papeer-v0.4.1-windows-amd64.exe.zip).
|
||||||
|
|
||||||
## (optional) MOBI support
|
## MOBI support
|
||||||
|
|
||||||
Install kindlegen to convert websites, Linux only
|
Install kindlegen to convert websites, Linux only
|
||||||
|
|
||||||
|
|||||||
@@ -104,6 +104,10 @@ var getCmd = &cobra.Command{
|
|||||||
return errors.New("cannot use threads option if depth/selector is not specified")
|
return errors.New("cannot use threads option if depth/selector is not specified")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if cmd.Flags().Changed("use-link-name") && getOpts.depth == 0 && len(getOpts.Selector) == 0 {
|
||||||
|
return errors.New("cannot use use-link-name option if depth/selector is not specified")
|
||||||
|
}
|
||||||
|
|
||||||
if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") {
|
if cmd.Flags().Changed("delay") && cmd.Flags().Changed("threads") {
|
||||||
return errors.New("cannot use delay and threads options at the same time")
|
return errors.New("cannot use delay and threads options at the same time")
|
||||||
}
|
}
|
||||||
@@ -114,9 +118,11 @@ var getCmd = &cobra.Command{
|
|||||||
url := args[0]
|
url := args[0]
|
||||||
|
|
||||||
// fill selector array with empty selectors to match depth
|
// fill selector array with empty selectors to match depth
|
||||||
for len(getOpts.Selector) < getOpts.depth+2 {
|
getOpts.Selector = append(getOpts.Selector, "")
|
||||||
|
for len(getOpts.Selector) < getOpts.depth+1 {
|
||||||
getOpts.Selector = append(getOpts.Selector, "")
|
getOpts.Selector = append(getOpts.Selector, "")
|
||||||
}
|
}
|
||||||
|
fmt.Println(len(getOpts.Selector))
|
||||||
|
|
||||||
// generate config for each level
|
// generate config for each level
|
||||||
configs := make([]*book.ScrapeConfig, len(getOpts.Selector))
|
configs := make([]*book.ScrapeConfig, len(getOpts.Selector))
|
||||||
|
|||||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
|||||||
Use: "version",
|
Use: "version",
|
||||||
Short: "Print the version number of papeer",
|
Short: "Print the version number of papeer",
|
||||||
Run: func(cmd *cobra.Command, args []string) {
|
Run: func(cmd *cobra.Command, args []string) {
|
||||||
fmt.Println("papeer v0.4.0")
|
fmt.Println("papeer v0.4.1")
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|||||||
25
release.sh
25
release.sh
@@ -14,21 +14,20 @@ do
|
|||||||
platform_split=(${platform//\// })
|
platform_split=(${platform//\// })
|
||||||
GOOS=${platform_split[0]}
|
GOOS=${platform_split[0]}
|
||||||
GOARCH=${platform_split[1]}
|
GOARCH=${platform_split[1]}
|
||||||
output_name='papeer-v'$version'-'$GOOS'-'$GOARCH
|
output_name=papeer
|
||||||
if [ $GOOS = "windows" ]; then
|
|
||||||
output_name+='.exe'
|
|
||||||
fi
|
|
||||||
|
|
||||||
env GOOS=$GOOS GOARCH=$GOARCH go build -o $output_name
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
echo 'An error has occurred! Aborting the script execution...'
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $GOOS = "windows" ]; then
|
if [ $GOOS = "windows" ]; then
|
||||||
zip "$output_name.exe.zip" "$output_name"
|
env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name.exe"
|
||||||
|
zip "$output_name-v$version-$GOOS-$GOARCH.exe.zip" "$output_name.exe"
|
||||||
|
rm "$output_name.exe"
|
||||||
else
|
else
|
||||||
tar czvf "$output_name.tar.gz" "$output_name"
|
env GOOS=$GOOS GOARCH=$GOARCH go build -o "$output_name"
|
||||||
|
tar czvf "$output_name-v$version-$GOOS-$GOARCH.tar.gz" "$output_name"
|
||||||
|
rm "$output_name"
|
||||||
fi
|
fi
|
||||||
rm "$output_name"
|
|
||||||
|
# if [ $? -ne 0 ]; then
|
||||||
|
# echo 'An error has occurred! Aborting the script execution...'
|
||||||
|
# exit 1
|
||||||
|
# fi
|
||||||
done
|
done
|
||||||
|
|||||||
Reference in New Issue
Block a user