mirror of
https://github.com/NohamR/papeer.git
synced 2026-05-24 20:00:45 +00:00
refactor, better scrape strategy
This commit is contained in:
15
cmd/get.go
15
cmd/get.go
@@ -68,20 +68,7 @@ var getCmd = &cobra.Command{
|
||||
},
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
url := args[0]
|
||||
var b book.Book
|
||||
|
||||
if recursive {
|
||||
b = book.NewBookFromURL(url, selector, include, limit, delay)
|
||||
} else {
|
||||
c := book.NewChapterFromURL(url)
|
||||
b = book.New(c.Name(), c.Author())
|
||||
b.AddChapter(c)
|
||||
}
|
||||
|
||||
// if quiet == false {
|
||||
// metadata := fmt.Sprintf("URL : %s\nTitle : %s\nAuthor : %s\nLength : %d\nExcerpt : %s\nSiteName: %s\nImage : %s\nFavicon : %s", url, article.Title, article.Byline, article.Length, article.Excerpt, article.SiteName, article.Image, article.Favicon)
|
||||
// fmt.Println(metadata)
|
||||
// }
|
||||
b := book.NewBookFromURL(url, selector, recursive, include, limit, delay)
|
||||
|
||||
if len(output) == 0 {
|
||||
// set default output
|
||||
|
||||
47
cmd/list.go
47
cmd/list.go
@@ -2,13 +2,14 @@ package cmd
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
urllib "net/url"
|
||||
"strings"
|
||||
"os"
|
||||
|
||||
colly "github.com/gocolly/colly/v2"
|
||||
"github.com/jedib0t/go-pretty/v6/table"
|
||||
cobra "github.com/spf13/cobra"
|
||||
|
||||
"github.com/lapwat/papeer/book"
|
||||
)
|
||||
|
||||
var listCmd = &cobra.Command{
|
||||
@@ -26,44 +27,22 @@ var listCmd = &cobra.Command{
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
if selector == "" {
|
||||
selector = "a"
|
||||
}
|
||||
links := book.GetLinks(base, selector)
|
||||
|
||||
// visit and count link classes
|
||||
classesLinks := map[string][]map[string]string{}
|
||||
classesCount := map[string]int{}
|
||||
classMax := ""
|
||||
t := table.NewWriter()
|
||||
t.SetOutputMirror(os.Stdout)
|
||||
t.AppendHeader(table.Row{"#", "Name", "Url", "Class"})
|
||||
|
||||
c := colly.NewCollector()
|
||||
c.OnHTML(selector, func(e *colly.HTMLElement) {
|
||||
href := e.Attr("href")
|
||||
text := strings.TrimSpace(e.Text)
|
||||
class := e.Attr("class")
|
||||
|
||||
if cmd.Flags().Changed("selector") || class != "" && text != "" {
|
||||
classesLinks[class] = append(classesLinks[class], map[string]string{
|
||||
"href": href,
|
||||
"text": text,
|
||||
})
|
||||
|
||||
classesCount[class]++
|
||||
|
||||
if classesCount[class] > classesCount[classMax] {
|
||||
classMax = class
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
c.Visit(base.String())
|
||||
for index, link := range classesLinks[classMax] {
|
||||
u, err := base.Parse(link["href"])
|
||||
for index, link := range links {
|
||||
u, err := base.Parse(link.Href())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String())
|
||||
t.AppendRow([]interface{}{index + 1, link.Text(), u.String(), link.Class()})
|
||||
}
|
||||
|
||||
t.Render()
|
||||
|
||||
},
|
||||
}
|
||||
|
||||
@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
|
||||
Use: "version",
|
||||
Short: "Print the version number of papeer",
|
||||
Run: func(cmd *cobra.Command, args []string) {
|
||||
fmt.Println("papeer v0.0.2")
|
||||
fmt.Println("papeer v0.1.1")
|
||||
},
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user