refactor, better scrape strategy

This commit is contained in:
lapwat
2021-09-24 02:09:22 +02:00
parent e50adf5e03
commit ee41e49dd1
8 changed files with 157 additions and 124 deletions

View File

@@ -68,20 +68,7 @@ var getCmd = &cobra.Command{
},
Run: func(cmd *cobra.Command, args []string) {
url := args[0]
var b book.Book
if recursive {
b = book.NewBookFromURL(url, selector, include, limit, delay)
} else {
c := book.NewChapterFromURL(url)
b = book.New(c.Name(), c.Author())
b.AddChapter(c)
}
// if quiet == false {
// metadata := fmt.Sprintf("URL : %s\nTitle : %s\nAuthor : %s\nLength : %d\nExcerpt : %s\nSiteName: %s\nImage : %s\nFavicon : %s", url, article.Title, article.Byline, article.Length, article.Excerpt, article.SiteName, article.Image, article.Favicon)
// fmt.Println(metadata)
// }
b := book.NewBookFromURL(url, selector, recursive, include, limit, delay)
if len(output) == 0 {
// set default output

View File

@@ -2,13 +2,14 @@ package cmd
import (
"errors"
"fmt"
"log"
urllib "net/url"
"strings"
"os"
colly "github.com/gocolly/colly/v2"
"github.com/jedib0t/go-pretty/v6/table"
cobra "github.com/spf13/cobra"
"github.com/lapwat/papeer/book"
)
var listCmd = &cobra.Command{
@@ -26,44 +27,22 @@ var listCmd = &cobra.Command{
log.Fatal(err)
}
if selector == "" {
selector = "a"
}
links := book.GetLinks(base, selector)
// visit and count link classes
classesLinks := map[string][]map[string]string{}
classesCount := map[string]int{}
classMax := ""
t := table.NewWriter()
t.SetOutputMirror(os.Stdout)
t.AppendHeader(table.Row{"#", "Name", "Url", "Class"})
c := colly.NewCollector()
c.OnHTML(selector, func(e *colly.HTMLElement) {
href := e.Attr("href")
text := strings.TrimSpace(e.Text)
class := e.Attr("class")
if cmd.Flags().Changed("selector") || class != "" && text != "" {
classesLinks[class] = append(classesLinks[class], map[string]string{
"href": href,
"text": text,
})
classesCount[class]++
if classesCount[class] > classesCount[classMax] {
classMax = class
}
}
})
c.Visit(base.String())
for index, link := range classesLinks[classMax] {
u, err := base.Parse(link["href"])
for index, link := range links {
u, err := base.Parse(link.Href())
if err != nil {
log.Fatal(err)
}
fmt.Printf("Chapter %d: %s %s\n", index+1, link["text"], u.String())
t.AppendRow([]interface{}{index + 1, link.Text(), u.String(), link.Class()})
}
t.Render()
},
}

View File

@@ -14,6 +14,6 @@ var versionCmd = &cobra.Command{
Use: "version",
Short: "Print the version number of papeer",
Run: func(cmd *cobra.Command, args []string) {
fmt.Println("papeer v0.0.2")
fmt.Println("papeer v0.1.1")
},
}