// Package parsers implements site-specific price scrapers.
package parsers
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"github.com/chromedp/cdproto/network"
|
||
"github.com/chromedp/chromedp"
|
||
log "github.com/sirupsen/logrus"
|
||
"golang.org/x/net/html"
|
||
"net/http"
|
||
"net/http/cookiejar"
|
||
"net/url"
|
||
"regexp"
|
||
"slices"
|
||
"strconv"
|
||
"strings"
|
||
"sync"
|
||
"task-processor/internal/appState"
|
||
"task-processor/internal/shared"
|
||
"time"
|
||
)
|
||
|
||
// MandarakeParser scrapes minimum item prices from mandarake.co.jp pages.
type MandarakeParser struct {
	// goroutinesNumber is the number of concurrent workers used by HandleTasks.
	goroutinesNumber int
	// parseParams holds the HTML selectors and constants used during parsing.
	parseParams parseParams
	// client is the shared HTTP client, seeded with site cookies by initClient2.
	client *http.Client
}
|
||
|
||
// parseParams bundles the scraping configuration for Mandarake pages.
type parseParams struct {
	// userAgent is sent with every plain HTTP request.
	userAgent string
	// cookieUrl is the page visited once to obtain session cookies.
	cookieUrl string
	// single selects the single-item price block.
	single price
	// ranged selects the price-range block.
	ranged price
	// taxMult is applied to prices taken from the ranged block
	// (1.1, i.e. 10% tax); the single price is used as-is.
	taxMult float64
}
|
||
|
||
// price describes how to locate a price value in the page's HTML: find a
// <tag attrKey="attrVal"> container, then read the text of its subTag
// descendants, keeping only strings that contain substring.
type price struct {
	// tag is the element name of the container node (e.g. "div").
	tag string
	// attrKey is the attribute used to identify the container (e.g. "class").
	attrKey string
	// attrVal is the required value of attrKey (e.g. "price").
	attrVal string
	// subTag is the descendant element whose text holds the price (e.g. "p").
	subTag string
	// substring must appear in the extracted text for it to count (e.g. "円").
	substring string
}
|
||
|
||
func NewMandarakeParser(goroutinesNumber int) *MandarakeParser {
|
||
p := parseParams{
|
||
userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||
cookieUrl: "https://www.mandarake.co.jp/",
|
||
single: price{
|
||
tag: "div",
|
||
attrKey: "class",
|
||
attrVal: "price",
|
||
subTag: "p",
|
||
substring: "円",
|
||
},
|
||
ranged: price{
|
||
tag: "div",
|
||
attrKey: "class",
|
||
attrVal: "price_range",
|
||
subTag: "p",
|
||
substring: "円"},
|
||
taxMult: 1.1,
|
||
}
|
||
|
||
return &MandarakeParser{
|
||
goroutinesNumber: goroutinesNumber,
|
||
parseParams: p,
|
||
}
|
||
}
|
||
|
||
func (s *MandarakeParser) HandleTasks(tasks []shared.Task, sender chan shared.TaskResult, state *appState.State) {
|
||
log.Debug("Handling Mandarake tasks")
|
||
|
||
if err := s.initClient2(); err != nil {
|
||
log.WithError(err).Error("Mandarake handler | Error initializing client")
|
||
return
|
||
}
|
||
|
||
receiver := make(chan shared.Task, len(tasks))
|
||
for _, task := range tasks {
|
||
receiver <- task
|
||
}
|
||
close(receiver)
|
||
|
||
wg := sync.WaitGroup{}
|
||
for i := 0; i < s.goroutinesNumber; i++ {
|
||
wg.Add(1)
|
||
go func() {
|
||
defer wg.Done()
|
||
s.worker(receiver, sender, state)
|
||
}()
|
||
}
|
||
wg.Wait()
|
||
log.Debug("Finished handling Mandarake tasks")
|
||
}
|
||
|
||
func (s *MandarakeParser) worker(receiver chan shared.Task, sender chan shared.TaskResult, state *appState.State) {
|
||
for task := range receiver {
|
||
log.WithField("task id", task.MerchUuid).Debug("Mandarake worker | Processing task")
|
||
|
||
page, err := s.getPage(task.Link)
|
||
if err != nil {
|
||
log.WithError(err).Error("Mandarake worker | Error getting page for task")
|
||
continue
|
||
}
|
||
|
||
if page == nil {
|
||
log.Debug("Mandarake worker | Page for task is nil")
|
||
continue
|
||
}
|
||
|
||
p := int32(s.getMinPrice(page))
|
||
|
||
sender <- shared.TaskResult{
|
||
MerchUuid: task.MerchUuid,
|
||
Origin: task.Origin,
|
||
Price: p,
|
||
}
|
||
}
|
||
}
|
||
|
||
// Deprecated: use initClient2 instead.
//
// initClient primes an HTTP client with the cookies returned by a plain GET
// request to the Mandarake front page, then stores that client on s for later
// scraping requests. Returns any network, jar, or URL-parsing error.
func (s *MandarakeParser) initClient() error {
	// Preload cookies for the client by hitting the cookie URL once.
	req, err := http.NewRequest("GET", s.parseParams.cookieUrl, nil)
	if err != nil {
		return err
	}

	// TODO: use one shared client with one cookie jar.
	client := http.Client{}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	result, err := client.Do(req)
	if err != nil {
		return err
	}
	defer result.Body.Close()

	c := result.Cookies()
	log.WithField("cookies", c).Debug("Mandarake handler | Get cookies")

	// Build the long-lived client around a jar seeded with those cookies.
	jar, err := cookiejar.New(nil)
	if err != nil {
		log.WithError(err).Error("Mandarake | Cookie jar")
		return err
	}

	u, err := url.Parse(s.parseParams.cookieUrl)
	if err != nil {
		log.WithError(err).Error("Mandarake | Parse cookie URL")
		return err
	}

	jar.SetCookies(u, c)

	taskClient := &http.Client{
		Timeout: time.Second * 30,
		Jar:     jar,
	}

	s.client = taskClient
	return nil
}
|
||
|
||
func (s *MandarakeParser) getPage(url string) (*html.Node, error) {
|
||
req, err := http.NewRequest("GET", url, nil)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
req.Header.Set("User-Agent", s.parseParams.userAgent)
|
||
|
||
result, err := s.client.Do(req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
doc, err := html.Parse(result.Body)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
return doc, nil
|
||
}
|
||
|
||
func (s *MandarakeParser) getMinPrice(page *html.Node) int {
|
||
singlePriceNode := s.findNode(page, s.parseParams.single)
|
||
if singlePriceNode == nil {
|
||
return 0
|
||
}
|
||
singlePriceStr := s.findData(singlePriceNode, s.parseParams.single)
|
||
if singlePriceStr == nil {
|
||
return 0
|
||
}
|
||
|
||
var prices []int
|
||
prices = append(prices, s.getPrice(singlePriceStr))
|
||
|
||
priceRangeNode := s.findNode(page, s.parseParams.ranged)
|
||
if priceRangeNode != nil {
|
||
priceFromRange := s.findData(priceRangeNode, s.parseParams.ranged)
|
||
if priceFromRange != nil {
|
||
withTax := int(float64(s.getPrice(priceFromRange)) * s.parseParams.taxMult)
|
||
prices = append(prices, withTax)
|
||
}
|
||
}
|
||
return slices.Min(prices)
|
||
}
|
||
|
||
func (s *MandarakeParser) findNode(doc *html.Node, params price) *html.Node {
|
||
if doc == nil {
|
||
return nil
|
||
}
|
||
|
||
var (
|
||
crawler func(*html.Node)
|
||
result *html.Node
|
||
)
|
||
|
||
crawler = func(node *html.Node) {
|
||
if result != nil {
|
||
return
|
||
}
|
||
|
||
if node.Type == html.ElementNode && node.Data == params.tag {
|
||
for _, attr := range node.Attr {
|
||
if attr.Key == params.attrKey && attr.Val == params.attrVal {
|
||
result = node
|
||
return
|
||
}
|
||
}
|
||
}
|
||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||
crawler(child)
|
||
}
|
||
}
|
||
crawler(doc)
|
||
|
||
return result
|
||
}
|
||
|
||
func (s *MandarakeParser) findData(doc *html.Node, params price) []string {
|
||
if doc == nil {
|
||
return nil
|
||
}
|
||
|
||
var (
|
||
crawler func(*html.Node)
|
||
values []string
|
||
getText func(*html.Node) string
|
||
)
|
||
|
||
getText = func(n *html.Node) string {
|
||
if n.Type == html.TextNode {
|
||
return n.Data
|
||
}
|
||
var result strings.Builder
|
||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||
result.WriteString(getText(c))
|
||
}
|
||
return result.String()
|
||
}
|
||
|
||
crawler = func(node *html.Node) {
|
||
if node.Type == html.ElementNode && node.Data == params.subTag {
|
||
text := strings.TrimSpace(getText(node))
|
||
if strings.Contains(text, params.substring) {
|
||
values = append(values, text)
|
||
}
|
||
}
|
||
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||
crawler(child)
|
||
}
|
||
}
|
||
crawler(doc)
|
||
return values
|
||
}
|
||
|
||
func (s *MandarakeParser) getPrice(rawStr []string) int {
|
||
re := regexp.MustCompile(`\([^)]*?([0-9,]+)[^)]*?\)`)
|
||
|
||
for _, str := range rawStr {
|
||
matches := re.FindStringSubmatch(str)
|
||
if len(matches) > 1 {
|
||
priceStr := strings.ReplaceAll(matches[1], ",", "")
|
||
price, err := strconv.Atoi(priceStr)
|
||
if err != nil {
|
||
fmt.Println(err)
|
||
return 0
|
||
}
|
||
return price
|
||
}
|
||
}
|
||
return 0
|
||
}
|
||
|
||
//new client
|
||
|
||
func (s *MandarakeParser) initClient2() error {
|
||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||
defer cancel()
|
||
|
||
ctx, _ = chromedp.NewContext(ctx)
|
||
|
||
if err := chromedp.Run(ctx,
|
||
chromedp.Navigate(s.parseParams.cookieUrl),
|
||
chromedp.WaitVisible("body", chromedp.ByQuery),
|
||
); err != nil {
|
||
return fmt.Errorf("failed to navigate: %w", err)
|
||
}
|
||
|
||
var cookies []*network.Cookie
|
||
err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
|
||
var err error
|
||
cookies, err = network.GetCookies().Do(ctx)
|
||
return err
|
||
}))
|
||
if err != nil {
|
||
return fmt.Errorf("failed to get cookies: %w", err)
|
||
}
|
||
|
||
var httpCookies []*http.Cookie
|
||
for _, c := range cookies {
|
||
httpCookies = append(httpCookies, &http.Cookie{
|
||
Name: c.Name,
|
||
Value: c.Value,
|
||
Path: c.Path,
|
||
Domain: c.Domain,
|
||
Expires: float64ToTime(c.Expires),
|
||
Secure: c.Secure,
|
||
HttpOnly: c.HTTPOnly,
|
||
SameSite: convertSameSite(c.SameSite),
|
||
})
|
||
}
|
||
|
||
jar, err := cookiejar.New(nil)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
u, err := url.Parse(s.parseParams.cookieUrl)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
jar.SetCookies(u, httpCookies)
|
||
|
||
client := &http.Client{
|
||
Jar: jar,
|
||
Timeout: 30 * time.Second,
|
||
}
|
||
|
||
s.client = client
|
||
return nil
|
||
}
|
||
|
||
func convertSameSite(s network.CookieSameSite) http.SameSite {
|
||
switch s {
|
||
case network.CookieSameSiteStrict:
|
||
return http.SameSiteStrictMode
|
||
case network.CookieSameSiteLax:
|
||
return http.SameSiteLaxMode
|
||
case network.CookieSameSiteNone:
|
||
return http.SameSiteNoneMode
|
||
default:
|
||
return http.SameSiteDefaultMode
|
||
}
|
||
}
|
||
|
||
func float64ToTime(unixFloat float64) time.Time {
|
||
sec := int64(unixFloat)
|
||
nsec := int64((unixFloat - float64(sec)) * 1e9)
|
||
return time.Unix(sec, nsec)
|
||
}
|