task-processor/internal/parsers/mandarake.go

272 lines
5.6 KiB
Go
Raw Normal View History

2025-10-02 20:35:53 +03:00
package parsers
import (
2025-10-06 20:33:35 +03:00
"fmt"
2025-10-02 20:35:53 +03:00
log "github.com/sirupsen/logrus"
2025-10-06 20:33:35 +03:00
"golang.org/x/net/html"
"net/http"
"net/http/cookiejar"
"net/url"
"regexp"
"slices"
"strconv"
"strings"
"sync"
2025-10-03 19:17:01 +03:00
"task-processor/internal/appState"
"task-processor/internal/shared"
2025-10-06 20:33:35 +03:00
"time"
2025-10-02 20:35:53 +03:00
)
2025-10-06 20:33:35 +03:00
// MandarakeParser downloads Mandarake item pages and extracts a price from
// each of them.
type MandarakeParser struct {
	// client performs the page requests; it is assigned by initClient.
	client *http.Client
	// parseParams holds the request settings and the HTML selectors used
	// while parsing.
	parseParams parseParams
	// goroutinesNumber is the number of worker goroutines HandleTasks starts.
	goroutinesNumber int
}
// parseParams groups the settings that drive fetching and price extraction.
type parseParams struct {
	// userAgent is sent as the User-Agent header on every request.
	// cookieUrl is requested once by initClient to obtain cookies.
	userAgent, cookieUrl string
	// single and ranged describe where the two kinds of price block are
	// looked up in a page.
	single, ranged price
	// taxMult is multiplied into the price taken from the ranged block.
	taxMult float64
}
// price describes how one kind of price block is located in a parsed page.
type price struct {
	// tag, attrKey and attrVal identify the container element: findNode
	// matches an element named tag carrying the attribute attrKey=attrVal.
	tag, attrKey, attrVal string
	// subTag is filled in by the constructor but is not read by any code
	// visible in this file.
	subTag string
	// substring must occur in the first child's data of a matching element
	// for findData to collect it.
	substring string
}
func NewMandarakeParser(goroutinesNumber int) *MandarakeParser {
p := parseParams{
userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
cookieUrl: "https://www.mandarake.co.jp/",
single: price{
tag: "div",
attrKey: "class",
attrVal: "price",
subTag: "p",
substring: "円",
},
ranged: price{
tag: "div",
attrKey: "class",
attrVal: "price_range",
subTag: "p",
substring: "円"},
taxMult: 1.1,
}
2025-10-02 20:35:53 +03:00
2025-10-06 20:33:35 +03:00
return &MandarakeParser{
goroutinesNumber: goroutinesNumber,
parseParams: p,
}
2025-10-02 20:35:53 +03:00
}
2025-10-06 20:33:35 +03:00
// HandleTasks processes tasks with s.goroutinesNumber concurrent workers and
// blocks until all of them have returned.
//
// The HTTP client is initialised first; if that fails the error is logged and
// no task is processed. Every task is queued as a copy with RetryCount set to
// 3. Results are written to sender by the workers; sender is not closed here.
func (s *MandarakeParser) HandleTasks(tasks []shared.Task, sender chan shared.TaskResult, state *appState.State) {
	log.Debug("Handling Mandarake tasks")

	err := s.initClient()
	if err != nil {
		log.WithError(err).Error("Mandarake handler | Error initializing client")
		return
	}

	// The queue is sized to hold every task, so filling it never blocks, and
	// it is closed up front so workers stop once it is drained.
	queue := make(chan shared.Task, len(tasks))
	for idx := range tasks {
		queued := tasks[idx]
		queued.RetryCount = 3
		queue <- queued
	}
	close(queue)

	var workers sync.WaitGroup
	runWorker := func() {
		defer workers.Done()
		s.worker(queue, sender, state)
	}
	for remaining := s.goroutinesNumber; remaining > 0; remaining-- {
		workers.Add(1)
		go runWorker()
	}
	workers.Wait()

	log.Debug("Finished handling Mandarake tasks")
}
// worker consumes tasks from receiver until the channel is closed and drained.
//
// For each task it downloads task.Link, extracts the minimum price and sends a
// TaskResult to sender. When the page cannot be fetched the error is logged
// and the task is skipped without a result. Neither task.RetryCount nor state
// is consulted here.
func (s *MandarakeParser) worker(receiver chan shared.Task, sender chan shared.TaskResult, state *appState.State) {
	for {
		task, open := <-receiver
		if !open {
			return
		}
		log.WithField("task id", task.MerchUuid).Debug("Mandarake worker | Processing task")

		page, err := s.getPage(task.Link)
		if err != nil {
			log.WithError(err).Error("Mandarake worker | Error getting page for task")
			continue
		}

		result := shared.TaskResult{
			MerchUuid: task.MerchUuid,
			Origin:    task.Origin,
		}
		result.Price = int32(s.getMinPrice(page))
		sender <- result
	}
}
// initClient prepares s.client for the page requests that follow.
//
// It requests parseParams.cookieUrl once with the configured User-Agent,
// copies the cookies of that response into a fresh cookie jar and installs a
// client with a 30 second timeout that uses the jar.
//
// An error is returned when the preload request cannot be built or sent, when
// the jar cannot be created, or when cookieUrl does not parse. s.client is
// only replaced on success.
func (s *MandarakeParser) initClient() error {
	const timeout = 30 * time.Second

	req, err := http.NewRequest(http.MethodGet, s.parseParams.cookieUrl, nil)
	if err != nil {
		return fmt.Errorf("building cookie preload request: %w", err)
	}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	// The preload uses a throwaway client; it gets the same timeout so an
	// unresponsive server cannot block task handling indefinitely.
	preload := http.Client{Timeout: timeout}
	resp, err := preload.Do(req)
	if err != nil {
		return fmt.Errorf("preloading cookies: %w", err)
	}
	defer resp.Body.Close()

	cookies := resp.Cookies()
	log.WithField("cookies", cookies).Debug("Mandarake handler | Get cookies")

	jar, err := cookiejar.New(nil)
	if err != nil {
		return fmt.Errorf("creating cookie jar: %w", err)
	}
	u, err := url.Parse(s.parseParams.cookieUrl)
	if err != nil {
		return fmt.Errorf("parsing cookie url: %w", err)
	}
	// Store the cookies in the new jar itself. s.client is still nil the first
	// time this runs, so going through s.client.Jar would panic, and the jar
	// handed to the new client would stay empty.
	jar.SetCookies(u, cookies)

	s.client = &http.Client{
		Timeout: timeout,
		Jar:     jar,
	}
	return nil
}
// getPage downloads url with the parser's client, sending the configured
// User-Agent, and parses the response body as HTML.
//
// It returns the root node of the parsed document, or an error when the
// request cannot be built or sent or the body cannot be parsed. The response
// status code is not inspected. The response body is closed before returning.
func (s *MandarakeParser) getPage(url string) (*html.Node, error) {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, err
	}
	// Without this the connection behind every fetched page is leaked.
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
// getMinPrice returns the lowest price found on page, or 0 when the single
// price block is missing or yields no candidate strings.
//
// The single price is taken as parsed. The price from the ranged block, when
// present and parsable, is multiplied by parseParams.taxMult and truncated to
// an int before the comparison.
func (s *MandarakeParser) getMinPrice(page *html.Node) int {
	single := s.parseParams.single
	singleNode := s.findNode(page, single)
	if singleNode == nil {
		return 0
	}
	singleText := s.findData(singleNode, single)
	if singleText == nil {
		return 0
	}

	prices := []int{s.getPrice(singleText)}

	ranged := s.parseParams.ranged
	if rangeNode := s.findNode(page, ranged); rangeNode != nil {
		if rangeText := s.findData(rangeNode, ranged); rangeText != nil {
			// getPrice reports failure as 0. Skip it, otherwise an unparsable
			// ranged block would win the minimum and hide a valid single price.
			if rangePrice := s.getPrice(rangeText); rangePrice > 0 {
				withTax := int(float64(rangePrice) * s.parseParams.taxMult)
				prices = append(prices, withTax)
			}
		}
	}
	return slices.Min(prices)
}
// findNode searches the tree rooted at doc depth-first, parents before
// children and siblings in order, and returns the first element whose tag is
// params.tag and which carries an attribute params.attrKey with the exact
// value params.attrVal. It returns nil when doc is nil or nothing matches.
func (s *MandarakeParser) findNode(doc *html.Node, params price) *html.Node {
	if doc == nil {
		return nil
	}

	isTarget := func(n *html.Node) bool {
		if n.Type != html.ElementNode || n.Data != params.tag {
			return false
		}
		for _, a := range n.Attr {
			if a.Key == params.attrKey && a.Val == params.attrVal {
				return true
			}
		}
		return false
	}

	var search func(*html.Node) *html.Node
	search = func(n *html.Node) *html.Node {
		if isTarget(n) {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if hit := search(c); hit != nil {
				return hit
			}
		}
		return nil
	}
	return search(doc)
}
// findData walks the tree rooted at doc and collects, in document order, the
// Data of the first child of every element named params.tag, provided that
// Data contains params.substring.
//
// It returns nil when doc is nil or when nothing was collected.
//
// NOTE(review): matching uses params.tag; params.subTag is not consulted here.
// Confirm that this is intended.
func (s *MandarakeParser) findData(doc *html.Node, params price) []string {
	if doc == nil {
		return nil
	}

	var (
		walk   func(*html.Node)
		values []string
	)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode && node.Data == params.tag {
			// An empty element has no first child; skip it rather than
			// dereferencing nil.
			if first := node.FirstChild; first != nil && strings.Contains(first.Data, params.substring) {
				values = append(values, first.Data)
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(doc)
	return values
}
func (s *MandarakeParser) getPrice(rawStr []string) int {
re := regexp.MustCompile(`\([^)]*?([0-9,]+)[^)]*?\)`)
for _, str := range rawStr {
matches := re.FindStringSubmatch(str)
if len(matches) > 1 {
priceStr := strings.ReplaceAll(matches[1], ",", "")
price, err := strconv.Atoi(priceStr)
if err != nil {
fmt.Println(err)
return 0
}
return price
}
}
return 0
2025-10-02 20:35:53 +03:00
}