task-processor/internal/parsers/mandarake.go
nquidox ae4dd4bf63
All checks were successful
/ Make image (push) Successful in 38s
new client
2025-12-20 16:06:05 +03:00

375 lines
8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package parsers
import (
"context"
"fmt"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
log "github.com/sirupsen/logrus"
"golang.org/x/net/html"
"net/http"
"net/http/cookiejar"
"net/url"
"regexp"
"slices"
"strconv"
"strings"
"sync"
"task-processor/internal/appState"
"task-processor/internal/shared"
"time"
)
// MandarakeParser fetches Mandarake item pages and extracts prices from them
// using a pool of worker goroutines.
type MandarakeParser struct {
	// goroutinesNumber is the number of workers HandleTasks starts.
	goroutinesNumber int

	// parseParams holds the request settings and the HTML selectors used
	// while parsing a page.
	parseParams parseParams

	// client is the HTTP client used by getPage. The constructor leaves it
	// nil; initClient / initClient2 assign it.
	client *http.Client
}
// parseParams groups everything the parser needs to request a page and to
// locate prices inside it.
type parseParams struct {
	// userAgent is sent as the User-Agent header of every request.
	userAgent string

	// cookieUrl is the page visited to obtain the initial cookies; the
	// cookies are stored in the jar under this URL.
	cookieUrl string

	// single selects the container and texts of the single-price block.
	single price

	// ranged selects the container and texts of the price-range block.
	ranged price

	// taxMult is the multiplier applied to the price read from the ranged
	// block before it is compared with the single price.
	taxMult float64
}
// price describes how to locate price text in a parsed HTML document.
type price struct {
	// tag is the element name of the container node (matched by findNode).
	tag string

	// attrKey and attrVal are the attribute name and exact value the
	// container element must carry (matched by findNode).
	attrKey string
	attrVal string

	// subTag is the element name whose text is collected inside the
	// container (matched by findData).
	subTag string

	// substring must occur in the collected text for it to be kept
	// (checked by findData).
	substring string
}
// NewMandarakeParser returns a parser configured with the Mandarake selectors
// and request settings that will run goroutinesNumber workers.
//
// The HTTP client is not created here; the returned parser has a nil client.
func NewMandarakeParser(goroutinesNumber int) *MandarakeParser {
	// Both price blocks share the same shape; only the class name differs.
	singleSelector := price{
		tag:       "div",
		attrKey:   "class",
		attrVal:   "price",
		subTag:    "p",
		substring: "円",
	}
	rangedSelector := singleSelector
	rangedSelector.attrVal = "price_range"

	parser := &MandarakeParser{goroutinesNumber: goroutinesNumber}
	parser.parseParams = parseParams{
		userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
		cookieUrl: "https://www.mandarake.co.jp/",
		single:    singleSelector,
		ranged:    rangedSelector,
		taxMult:   1.1,
	}
	return parser
}
// HandleTasks processes every task in tasks with a pool of workers and blocks
// until all of them have finished. Results are written to sender by the
// workers; sender is not closed here. state is forwarded to the workers
// unchanged.
//
// If the HTTP client cannot be initialised the error is logged and no task is
// processed.
func (s *MandarakeParser) HandleTasks(tasks []shared.Task, sender chan shared.TaskResult, state *appState.State) {
	log.Debug("Handling Mandarake tasks")

	// Nothing to fetch: skip the headless-browser cookie bootstrap entirely.
	if len(tasks) == 0 {
		log.Debug("Finished handling Mandarake tasks")
		return
	}

	if err := s.initClient2(); err != nil {
		log.WithError(err).Error("Mandarake handler | Error initializing client")
		return
	}

	// The queue is buffered to hold every task, so filling it never blocks,
	// and it is closed up front so workers stop once it is drained.
	receiver := make(chan shared.Task, len(tasks))
	for _, task := range tasks {
		receiver <- task
	}
	close(receiver)

	// A non-positive goroutinesNumber would start no worker and silently
	// drop every task, so always run at least one. More workers than tasks
	// would only sit idle.
	workers := s.goroutinesNumber
	if workers < 1 {
		workers = 1
	}
	if workers > len(tasks) {
		workers = len(tasks)
	}

	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.worker(receiver, sender, state)
		}()
	}
	wg.Wait()

	log.Debug("Finished handling Mandarake tasks")
}
// worker drains receiver until it is closed. For every task it downloads the
// linked page, extracts the minimum price and sends a TaskResult to sender.
// Tasks whose page cannot be fetched (error or nil document) are logged and
// skipped without producing a result. state is currently unused.
func (s *MandarakeParser) worker(receiver chan shared.Task, sender chan shared.TaskResult, state *appState.State) {
	for {
		task, open := <-receiver
		if !open {
			return
		}

		log.WithField("task id", task.MerchUuid).Debug("Mandarake worker | Processing task")

		page, err := s.getPage(task.Link)
		switch {
		case err != nil:
			log.WithError(err).Error("Mandarake worker | Error getting page for task")
			continue
		case page == nil:
			log.Debug("Mandarake worker | Page for task is nil")
			continue
		}

		result := shared.TaskResult{
			MerchUuid: task.MerchUuid,
			Origin:    task.Origin,
			Price:     int32(s.getMinPrice(page)),
		}
		sender <- result
	}
}
// initClient requests cookieUrl with a plain HTTP client, copies the cookies
// of that response into a fresh cookie jar and stores a jar-backed client
// with a 30 second timeout in s.client.
//
// Deprecated: use initClient2 instead.
func (s *MandarakeParser) initClient() error {
	const requestTimeout = 30 * time.Second

	// Preload cookies for the client.
	req, err := http.NewRequest("GET", s.parseParams.cookieUrl, nil)
	if err != nil {
		return err
	}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	// TODO: use a single client with a single jar.
	// The bootstrap request gets the same timeout as the final client so an
	// unresponsive server cannot block initialisation forever.
	bootstrap := http.Client{Timeout: requestTimeout}
	result, err := bootstrap.Do(req)
	if err != nil {
		return err
	}
	defer result.Body.Close()

	c := result.Cookies()
	log.WithField("cookies", c).Debug("Mandarake handler | Get cookies")

	// Build the client that will be used for the tasks.
	jar, err := cookiejar.New(nil)
	if err != nil {
		log.WithError(err).Error("Mandarake | Cookie jar")
		return err
	}

	u, err := url.Parse(s.parseParams.cookieUrl)
	if err != nil {
		log.WithError(err).Error("Mandarake | Parse cookie URL")
		return err
	}
	jar.SetCookies(u, c)

	s.client = &http.Client{
		Timeout: requestTimeout,
		Jar:     jar,
	}
	return nil
}
// getPage downloads url with s.client, sending the configured User-Agent, and
// returns the parsed HTML document.
//
// An error is returned when the request cannot be built, the request fails or
// the body cannot be parsed.
//
// NOTE(review): the response status code is not inspected, so error pages are
// parsed like any other document — confirm this is intended.
func (s *MandarakeParser) getPage(url string) (*html.Node, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	result, err := s.client.Do(req)
	if err != nil {
		return nil, err
	}
	// Close the body on every path; otherwise each fetched page leaks its
	// underlying connection.
	defer result.Body.Close()

	doc, err := html.Parse(result.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
// getMinPrice returns the lowest price found on page.
//
// The single-price block is mandatory: when its container or its texts are
// missing the result is 0. When the price-range block is present as well, its
// price is multiplied by taxMult (truncated to int) and the smaller of the
// two values is returned.
//
// NOTE(review): getPrice reports an unparsable text as 0, and that 0 takes
// part in the minimum — confirm this is the intended outcome.
func (s *MandarakeParser) getMinPrice(page *html.Node) int {
	// texts returns the matching price texts for one selector, or nil when
	// either the container or the texts are missing.
	texts := func(selector price) []string {
		container := s.findNode(page, selector)
		if container == nil {
			return nil
		}
		return s.findData(container, selector)
	}

	singleTexts := texts(s.parseParams.single)
	if singleTexts == nil {
		return 0
	}
	candidates := []int{s.getPrice(singleTexts)}

	if rangedTexts := texts(s.parseParams.ranged); rangedTexts != nil {
		base := s.getPrice(rangedTexts)
		candidates = append(candidates, int(float64(base)*s.parseParams.taxMult))
	}

	return slices.Min(candidates)
}
// findNode walks doc depth-first in document order and returns the first
// element whose name equals params.tag and that has an attribute
// params.attrKey with the exact value params.attrVal. It returns nil when doc
// is nil or no element matches.
func (s *MandarakeParser) findNode(doc *html.Node, params price) *html.Node {
	if doc == nil {
		return nil
	}

	// isTarget reports whether n is the element described by params.
	isTarget := func(n *html.Node) bool {
		if n.Type != html.ElementNode || n.Data != params.tag {
			return false
		}
		for _, a := range n.Attr {
			if a.Key == params.attrKey && a.Val == params.attrVal {
				return true
			}
		}
		return false
	}

	var search func(*html.Node) *html.Node
	search = func(n *html.Node) *html.Node {
		if isTarget(n) {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if found := search(c); found != nil {
				return found
			}
		}
		return nil
	}

	return search(doc)
}
// findData collects, in document order, the trimmed text of every
// params.subTag element below doc (doc included) whose text contains
// params.substring. The text of an element is the concatenation of all text
// nodes beneath it. It returns nil when doc is nil or nothing matches.
func (s *MandarakeParser) findData(doc *html.Node, params price) []string {
	if doc == nil {
		return nil
	}

	// appendText writes the text of n and of all its descendants to sb.
	var appendText func(n *html.Node, sb *strings.Builder)
	appendText = func(n *html.Node, sb *strings.Builder) {
		if n.Type == html.TextNode {
			sb.WriteString(n.Data)
			return
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			appendText(c, sb)
		}
	}

	var values []string

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == params.subTag {
			var sb strings.Builder
			appendText(n, &sb)
			if text := strings.TrimSpace(sb.String()); strings.Contains(text, params.substring) {
				values = append(values, text)
			}
		}
		// Matching elements are descended into as well.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	return values
}
// mandarakeParenPriceRe matches a parenthesised group that contains a run of
// digits and commas and captures that run. It is compiled once at package
// level instead of on every getPrice call.
var mandarakeParenPriceRe = regexp.MustCompile(`\([^)]*?([0-9,]+)[^)]*?\)`)

// getPrice returns the number found inside the first parenthesised group of
// the first string in rawStr that contains one, with comma separators
// removed.
//
// It returns 0 when no string matches, or when the captured text is not a
// valid integer (in which case the conversion error is printed).
func (s *MandarakeParser) getPrice(rawStr []string) int {
	for _, str := range rawStr {
		matches := mandarakeParenPriceRe.FindStringSubmatch(str)
		if len(matches) < 2 {
			continue
		}

		digits := strings.ReplaceAll(matches[1], ",", "")
		value, err := strconv.Atoi(digits)
		if err != nil {
			// NOTE(review): this goes to stdout rather than through the
			// logger used by the rest of the file.
			fmt.Println(err)
			return 0
		}
		return value
	}
	return 0
}
// initClient2 opens cookieUrl in a chromedp-driven browser, waits for the
// page body to become visible, reads the browser's cookies and stores them in
// a fresh cookie jar. A jar-backed HTTP client with a 30 second timeout is
// then assigned to s.client.
//
// The whole browser interaction is bounded by a 30 second timeout.
func (s *MandarakeParser) initClient2() error {
	timeoutCtx, cancelTimeout := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancelTimeout()

	// Keep and call the cancel func returned by chromedp so the browser
	// resources are released deterministically when this function returns.
	ctx, cancelBrowser := chromedp.NewContext(timeoutCtx)
	defer cancelBrowser()

	if err := chromedp.Run(ctx,
		chromedp.Navigate(s.parseParams.cookieUrl),
		chromedp.WaitVisible("body", chromedp.ByQuery),
	); err != nil {
		return fmt.Errorf("failed to navigate: %w", err)
	}

	var cookies []*network.Cookie
	err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
		var err error
		cookies, err = network.GetCookies().Do(ctx)
		return err
	}))
	if err != nil {
		return fmt.Errorf("failed to get cookies: %w", err)
	}

	httpCookies := make([]*http.Cookie, 0, len(cookies))
	for _, c := range cookies {
		hc := &http.Cookie{
			Name:     c.Name,
			Value:    c.Value,
			Path:     c.Path,
			Domain:   c.Domain,
			Secure:   c.Secure,
			HttpOnly: c.HTTPOnly,
			SameSite: convertSameSite(c.SameSite),
		}
		// A cookie without an expiry date is reported with a non-positive
		// Expires (-1). Converting that to a time would yield a date in the
		// past, and the jar discards cookies that are already expired, so
		// Expires is left unset for such cookies.
		if c.Expires > 0 {
			hc.Expires = float64ToTime(c.Expires)
		}
		httpCookies = append(httpCookies, hc)
	}

	jar, err := cookiejar.New(nil)
	if err != nil {
		return err
	}

	u, err := url.Parse(s.parseParams.cookieUrl)
	if err != nil {
		return err
	}
	jar.SetCookies(u, httpCookies)

	s.client = &http.Client{
		Jar:     jar,
		Timeout: 30 * time.Second,
	}
	return nil
}
// convertSameSite maps a chromedp cookie SameSite value to its net/http
// counterpart. Strict, Lax and None map to the matching mode; any other value
// maps to http.SameSiteDefaultMode.
func convertSameSite(s network.CookieSameSite) http.SameSite {
	mode := http.SameSiteDefaultMode
	switch s {
	case network.CookieSameSiteStrict:
		mode = http.SameSiteStrictMode
	case network.CookieSameSiteLax:
		mode = http.SameSiteLaxMode
	case network.CookieSameSiteNone:
		mode = http.SameSiteNoneMode
	}
	return mode
}
// float64ToTime converts a Unix timestamp given as fractional seconds into a
// time.Time, keeping the fractional part as nanoseconds.
//
// A non-positive value is treated as "no timestamp" and yields the zero
// time.Time. This function is used for cookie expiry values, where a missing
// expiry is reported as -1; turning that into a pre-1970 instant would mark
// the cookie as already expired.
func float64ToTime(unixFloat float64) time.Time {
	if unixFloat <= 0 {
		return time.Time{}
	}
	sec := int64(unixFloat)
	nsec := int64((unixFloat - float64(sec)) * 1e9)
	return time.Unix(sec, nsec)
}