package parsers

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"regexp"
	"slices"
	"strconv"
	"strings"
	"sync"
	"time"

	log "github.com/sirupsen/logrus"
	"golang.org/x/net/html"

	"task-processor/internal/appState"
	"task-processor/internal/shared"
)

// mandarakeClientTimeout bounds every HTTP request made by the parser.
const mandarakeClientTimeout = 30 * time.Second

// mandarakePriceRe captures the first run of digits and commas found inside a
// parenthesised group, e.g. "1,100" for an input like "1,000 (abc 1,100 def)".
// It is compiled once instead of on every getPrice call.
var mandarakePriceRe = regexp.MustCompile(`\([^)]*?([0-9,]+)[^)]*?\)`)

// MandarakeParser downloads Mandarake pages and extracts the lowest price
// found on each of them.
type MandarakeParser struct {
	// goroutinesNumber is the number of workers HandleTasks tries to start.
	goroutinesNumber int
	// parseParams holds the request settings and the price selectors.
	parseParams parseParams
	// client is the cookie-carrying HTTP client created by initClient.
	client *http.Client
}

// parseParams groups everything needed to request a page and locate prices.
type parseParams struct {
	// userAgent is sent as the User-Agent header of every request.
	userAgent string
	// cookieUrl is requested once per HandleTasks call to obtain cookies.
	cookieUrl string
	// single selects the block whose price is used as is.
	single price
	// ranged selects the block whose price is multiplied by taxMult.
	ranged price
	// taxMult is the multiplier applied to the price taken from ranged.
	taxMult float64
}

// price describes where a price string lives inside a page.
type price struct {
	// tag is the element name of the container node.
	tag string
	// attrKey and attrVal are the attribute the container must carry; the
	// value is compared for exact equality.
	attrKey string
	attrVal string
	// subTag is the element name of the descendants whose text is collected.
	subTag string
	// substring must be contained in a descendant's text for it to be kept.
	substring string
}

// NewMandarakeParser returns a parser that processes tasks with
// goroutinesNumber concurrent workers.
func NewMandarakeParser(goroutinesNumber int) *MandarakeParser {
	p := parseParams{
		userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
		cookieUrl: "https://www.mandarake.co.jp/",
		single: price{
			tag:       "div",
			attrKey:   "class",
			attrVal:   "price",
			subTag:    "p",
			substring: "円",
		},
		ranged: price{
			tag:       "div",
			attrKey:   "class",
			attrVal:   "price_range",
			subTag:    "p",
			substring: "円",
		},
		taxMult: 1.1,
	}

	return &MandarakeParser{
		goroutinesNumber: goroutinesNumber,
		parseParams:      p,
	}
}

// HandleTasks fetches the page of every task with a pool of workers and sends
// one TaskResult to sender for each page that could be fetched and parsed. It
// blocks until all workers have finished. If the HTTP client cannot be
// initialised the error is logged and no task is processed.
func (s *MandarakeParser) HandleTasks(tasks []shared.Task, sender chan shared.TaskResult, state *appState.State) {
	log.Debug("Handling Mandarake tasks")

	if err := s.initClient(); err != nil {
		log.WithError(err).Error("Mandarake handler | Error initializing client")
		return
	}

	// The queue is buffered to hold every task, so it can be filled and closed
	// before any worker starts.
	receiver := make(chan shared.Task, len(tasks))
	for _, task := range tasks {
		receiver <- task
	}
	close(receiver)

	// A non-positive goroutinesNumber would start no worker and silently drop
	// every task; more workers than tasks would only idle.
	workers := s.goroutinesNumber
	if workers < 1 {
		workers = 1
	}
	if workers > len(tasks) {
		workers = len(tasks)
	}

	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.worker(receiver, sender, state)
		}()
	}
	wg.Wait()

	log.Debug("Finished handling Mandarake tasks")
}

// worker drains receiver, fetching each task's page and sending the minimum
// price found on it to sender. Tasks whose page cannot be fetched are logged
// and skipped.
func (s *MandarakeParser) worker(receiver chan shared.Task, sender chan shared.TaskResult, state *appState.State) {
	for task := range receiver {
		log.WithField("task id", task.MerchUuid).Debug("Mandarake worker | Processing task")

		page, err := s.getPage(task.Link)
		if err != nil {
			log.WithError(err).Error("Mandarake worker | Error getting page for task")
			continue
		}
		if page == nil {
			log.Debug("Mandarake worker | Page for task is nil")
			continue
		}

		sender <- shared.TaskResult{
			MerchUuid: task.MerchUuid,
			Origin:    task.Origin,
			Price:     int32(s.getMinPrice(page)),
		}
	}
}

// initClient requests parseParams.cookieUrl once, stores the cookies of the
// response in a fresh jar and installs a client using that jar as s.client.
func (s *MandarakeParser) initClient() error {
	// Preload cookies with a throw-away client.
	req, err := http.NewRequest(http.MethodGet, s.parseParams.cookieUrl, nil)
	if err != nil {
		return fmt.Errorf("building cookie request: %w", err)
	}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	// The timeout keeps an unresponsive server from blocking HandleTasks
	// forever.
	preload := &http.Client{Timeout: mandarakeClientTimeout}
	resp, err := preload.Do(req)
	if err != nil {
		return fmt.Errorf("requesting cookies: %w", err)
	}
	// Only the response headers are needed, but the body must still be closed.
	defer resp.Body.Close()

	cookies := resp.Cookies()
	log.WithField("cookies", cookies).Debug("Mandarake handler | Get cookies")

	// Build the client used for the actual page requests.
	jar, err := cookiejar.New(nil)
	if err != nil {
		return fmt.Errorf("creating cookie jar: %w", err)
	}
	u, err := url.Parse(s.parseParams.cookieUrl)
	if err != nil {
		return fmt.Errorf("parsing cookie URL: %w", err)
	}
	jar.SetCookies(u, cookies)

	s.client = &http.Client{
		Timeout: mandarakeClientTimeout,
		Jar:     jar,
	}
	return nil
}

// getPage downloads rawURL with s.client and returns the parsed HTML tree.
// The response status code is not inspected.
func (s *MandarakeParser) getPage(rawURL string) (*html.Node, error) {
	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, fmt.Errorf("building page request: %w", err)
	}
	req.Header.Set("User-Agent", s.parseParams.userAgent)

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("requesting page: %w", err)
	}
	// Without this every fetched page leaks its connection.
	defer resp.Body.Close()

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing page %q: %w", rawURL, err)
	}
	return doc, nil
}

// getMinPrice returns the smaller of the price found in the "single" block
// and the price found in the "ranged" block multiplied by taxMult. It returns
// 0 when the single block or its price text is missing.
//
// NOTE(review): getPrice yields 0 for text it cannot parse, and that 0 takes
// part in the minimum - confirm callers treat 0 as "unknown".
func (s *MandarakeParser) getMinPrice(page *html.Node) int {
	// findData returns nil for a nil node, so a missing block and a block
	// without matching text are handled by the same check.
	singleTexts := s.findData(s.findNode(page, s.parseParams.single), s.parseParams.single)
	if len(singleTexts) == 0 {
		return 0
	}
	prices := []int{s.getPrice(singleTexts)}

	rangeTexts := s.findData(s.findNode(page, s.parseParams.ranged), s.parseParams.ranged)
	if len(rangeTexts) > 0 {
		// The conversion back to int truncates the fractional part.
		withTax := int(float64(s.getPrice(rangeTexts)) * s.parseParams.taxMult)
		prices = append(prices, withTax)
	}

	return slices.Min(prices)
}

// findNode returns the first element, in document order, named params.tag
// that has an attribute params.attrKey equal to params.attrVal, or nil when
// there is none or doc is nil.
func (s *MandarakeParser) findNode(doc *html.Node, params price) *html.Node {
	if doc == nil {
		return nil
	}

	matches := func(n *html.Node) bool {
		if n.Type != html.ElementNode || n.Data != params.tag {
			return false
		}
		for _, attr := range n.Attr {
			if attr.Key == params.attrKey && attr.Val == params.attrVal {
				return true
			}
		}
		return false
	}

	var walk func(*html.Node) *html.Node
	walk = func(n *html.Node) *html.Node {
		if matches(n) {
			return n
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			if found := walk(c); found != nil {
				return found
			}
		}
		return nil
	}

	return walk(doc)
}

// findData collects, in document order, the trimmed text of every
// params.subTag element under doc (doc included) whose text contains
// params.substring. It returns nil when doc is nil or nothing matches.
func (s *MandarakeParser) findData(doc *html.Node, params price) []string {
	if doc == nil {
		return nil
	}

	// textOf concatenates all text nodes below n.
	var textOf func(*html.Node) string
	textOf = func(n *html.Node) string {
		if n.Type == html.TextNode {
			return n.Data
		}
		var sb strings.Builder
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			sb.WriteString(textOf(c))
		}
		return sb.String()
	}

	var values []string
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == params.subTag {
			text := strings.TrimSpace(textOf(n))
			if strings.Contains(text, params.substring) {
				values = append(values, text)
			}
		}
		// Matching elements are descended into as well, so nested subTag
		// elements are reported too.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	return values
}

// getPrice returns the number captured by mandarakePriceRe in the first
// element of rawStr that matches, with thousands separators removed. It
// returns 0 when no element matches or the captured text is not a valid
// integer.
func (s *MandarakeParser) getPrice(rawStr []string) int {
	for _, str := range rawStr {
		m := mandarakePriceRe.FindStringSubmatch(str)
		if len(m) < 2 {
			continue
		}

		digits := strings.ReplaceAll(m[1], ",", "")
		value, err := strconv.Atoi(digits)
		if err != nil {
			// Report through the logger used everywhere else in this file
			// rather than printing to stdout.
			log.WithError(err).WithField("price", m[1]).Error("Mandarake | Error converting price")
			return 0
		}
		return value
	}
	return 0
}