From 7df0dcdf088eafc6afa0d3024bfc54f3e607e1d8 Mon Sep 17 00:00:00 2001
From: nquidox
Date: Mon, 6 Oct 2025 20:33:35 +0300
Subject: [PATCH] parser impl

---
 internal/app/app.go           |   2 +-
 internal/parsers/mandarake.go | 299 +++++++++++++++++++++++++++++++++-
 2 files changed, 294 insertions(+), 7 deletions(-)

diff --git a/internal/app/app.go b/internal/app/app.go
index d9875ae..b92d385 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -72,7 +72,7 @@ func (app *App) Run() {
 	//task processor
 	handlers := map[string]parsers.TaskHandler{
 		shared.OriginSurugaya:  parsers.NewSurugayaParser(ctx, surugayaScrapper),
-		shared.OriginMandarake: parsers.NewMandarakeParser(),
+		shared.OriginMandarake: parsers.NewMandarakeParser(app.numCPUs),
 	}
 
 	taskProcessor := processor.New(processor.Deps{
diff --git a/internal/parsers/mandarake.go b/internal/parsers/mandarake.go
--- a/internal/parsers/mandarake.go
+++ b/internal/parsers/mandarake.go
@@ -1,17 +1,304 @@
 package parsers
 
 import (
+	"fmt"
 	log "github.com/sirupsen/logrus"
+	"golang.org/x/net/html"
+	"net/http"
+	"net/http/cookiejar"
+	"net/url"
+	"regexp"
+	"slices"
+	"strconv"
+	"strings"
+	"sync"
 	"task-processor/internal/appState"
 	"task-processor/internal/shared"
+	"time"
 )
 
-type MandarakeParser struct{}
-
-func NewMandarakeParser() *MandarakeParser {
-	return &MandarakeParser{}
+// requestTimeout bounds every HTTP request made by the parser, including the
+// cookie preload.
+const requestTimeout = 30 * time.Second
+
+// priceInParensRe captures the first run of digits and commas found inside a
+// pair of ASCII parentheses. It is compiled once rather than on every call.
+var priceInParensRe = regexp.MustCompile(`\([^)]*?([0-9,]+)[^)]*?\)`)
+
+// MandarakeParser fetches Mandarake item pages and extracts the lowest price
+// found on each page.
+type MandarakeParser struct {
+	goroutinesNumber int
+	parseParams      parseParams
+	client           *http.Client
 }
 
-func (s *MandarakeParser) HandleTasks(task []shared.Task, sender chan shared.TaskResult, state *appState.State) {
-	log.Debug("Handling Mandarake Task")
+// parseParams holds the request settings and the HTML selectors used to
+// locate prices on a page.
+type parseParams struct {
+	userAgent string
+	cookieUrl string
+	single    price
+	ranged    price
+	taxMult   float64
+}
+
+// price describes where an amount is rendered: a container element matched by
+// tag and one attribute, and the sub-element whose text holds the amount.
+type price struct {
+	tag       string
+	attrKey   string
+	attrVal   string
+	subTag    string
+	substring string
+}
+
+// NewMandarakeParser returns a parser that processes tasks with
+// goroutinesNumber concurrent workers. The HTTP client is not created here;
+// HandleTasks builds it before fetching.
+func NewMandarakeParser(goroutinesNumber int) *MandarakeParser {
+	p := parseParams{
+		userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+		cookieUrl: "https://www.mandarake.co.jp/",
+		single: price{
+			tag:       "div",
+			attrKey:   "class",
+			attrVal:   "price",
+			subTag:    "p",
+			substring: "円",
+		},
+		ranged: price{
+			tag:       "div",
+			attrKey:   "class",
+			attrVal:   "price_range",
+			subTag:    "p",
+			substring: "円",
+		},
+		taxMult: 1.1,
+	}
+
+	return &MandarakeParser{
+		goroutinesNumber: goroutinesNumber,
+		parseParams:      p,
+	}
+}
+
+// HandleTasks fetches the page of every task and sends one TaskResult per
+// successfully fetched page to sender. It blocks until all workers are done.
+// Tasks whose page cannot be fetched are logged and produce no result.
+func (s *MandarakeParser) HandleTasks(tasks []shared.Task, sender chan shared.TaskResult, state *appState.State) {
+	log.Debug("Handling Mandarake tasks")
+
+	if len(tasks) == 0 {
+		// Nothing to fetch, so skip the cookie preload request as well.
+		return
+	}
+
+	if err := s.initClient(); err != nil {
+		log.WithError(err).Error("Mandarake handler | Error initializing client")
+		return
+	}
+
+	receiver := make(chan shared.Task, len(tasks))
+	for _, task := range tasks {
+		task.RetryCount = 3
+		receiver <- task
+	}
+	close(receiver)
+
+	// A non-positive worker count would start no workers and silently drop
+	// every task; more workers than tasks would only sit idle.
+	workers := min(max(s.goroutinesNumber, 1), len(tasks))
+
+	var wg sync.WaitGroup
+	for i := 0; i < workers; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			s.worker(receiver, sender, state)
+		}()
+	}
+	wg.Wait()
+	log.Debug("Finished handling Mandarake tasks")
+}
+
+// worker drains receiver, fetching each task's page and sending the parsed
+// minimum price to sender. A task whose page cannot be fetched is skipped.
+func (s *MandarakeParser) worker(receiver chan shared.Task, sender chan shared.TaskResult, state *appState.State) {
+	for task := range receiver {
+		log.WithField("task id", task.MerchUuid).Debug("Mandarake worker | Processing task")
+
+		page, err := s.getPage(task.Link)
+		if err != nil {
+			log.WithError(err).Error("Mandarake worker | Error getting page for task")
+			continue
+		}
+
+		sender <- shared.TaskResult{
+			MerchUuid: task.MerchUuid,
+			Origin:    task.Origin,
+			Price:     int32(s.getMinPrice(page)),
+		}
+	}
+}
+
+// initClient builds the HTTP client used for task requests and preloads its
+// cookie jar by requesting cookieUrl once.
+func (s *MandarakeParser) initClient() error {
+	jar, err := cookiejar.New(nil)
+	if err != nil {
+		return fmt.Errorf("creating cookie jar: %w", err)
+	}
+
+	u, err := url.Parse(s.parseParams.cookieUrl)
+	if err != nil {
+		return fmt.Errorf("parsing cookie url: %w", err)
+	}
+
+	req, err := http.NewRequest(http.MethodGet, s.parseParams.cookieUrl, nil)
+	if err != nil {
+		return fmt.Errorf("building cookie request: %w", err)
+	}
+	req.Header.Set("User-Agent", s.parseParams.userAgent)
+
+	client := &http.Client{
+		Timeout: requestTimeout,
+		Jar:     jar,
+	}
+
+	// The request goes through the jar-backed client, so net/http stores the
+	// response cookies in jar itself; no manual copy is needed.
+	resp, err := client.Do(req)
+	if err != nil {
+		return fmt.Errorf("preloading cookies: %w", err)
+	}
+	defer resp.Body.Close()
+
+	log.WithField("cookies", jar.Cookies(u)).Debug("Mandarake handler | Get cookies")
+
+	s.client = client
+	return nil
+}
+
+// getPage downloads link and parses the response body as HTML. It returns an
+// error for transport failures, non-200 responses and unparsable bodies.
+func (s *MandarakeParser) getPage(link string) (*html.Node, error) {
+	req, err := http.NewRequest(http.MethodGet, link, nil)
+	if err != nil {
+		return nil, err
+	}
+	req.Header.Set("User-Agent", s.parseParams.userAgent)
+
+	resp, err := s.client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	// Always close the body so the underlying connection is released.
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("get %q: unexpected status %s", link, resp.Status)
+	}
+
+	return html.Parse(resp.Body)
+}
+
+// getMinPrice returns the lowest price found on page, or 0 when no price
+// could be parsed. The ranged price is multiplied by taxMult before it is
+// compared with the single price.
+func (s *MandarakeParser) getMinPrice(page *html.Node) int {
+	var prices []int
+
+	single := s.parseParams.single
+	if p := s.getPrice(s.findData(s.findNode(page, single), single)); p > 0 {
+		prices = append(prices, p)
+	}
+
+	ranged := s.parseParams.ranged
+	if p := s.getPrice(s.findData(s.findNode(page, ranged), ranged)); p > 0 {
+		prices = append(prices, int(float64(p)*s.parseParams.taxMult))
+	}
+
+	// getPrice reports 0 for "nothing parsed"; such values are left out so
+	// they cannot win the minimum. slices.Min panics on an empty slice.
+	if len(prices) == 0 {
+		return 0
+	}
+	return slices.Min(prices)
+}
+
+// findNode returns the first element in document order under doc (doc
+// included) whose tag is params.tag and that carries the attribute
+// params.attrKey with the exact value params.attrVal, or nil if none exists.
+func (s *MandarakeParser) findNode(doc *html.Node, params price) *html.Node {
+	if doc == nil {
+		return nil
+	}
+
+	if doc.Type == html.ElementNode && doc.Data == params.tag {
+		for _, attr := range doc.Attr {
+			if attr.Key == params.attrKey && attr.Val == params.attrVal {
+				return doc
+			}
+		}
+	}
+
+	for child := doc.FirstChild; child != nil; child = child.NextSibling {
+		if found := s.findNode(child, params); found != nil {
+			return found
+		}
+	}
+	return nil
+}
+
+// findData collects the text of every text node that contains
+// params.substring and is a direct child of a params.subTag or params.tag
+// element under doc (doc included). It returns nil when doc is nil.
+func (s *MandarakeParser) findData(doc *html.Node, params price) []string {
+	if doc == nil {
+		return nil
+	}
+
+	var (
+		values  []string
+		crawler func(*html.Node)
+	)
+
+	crawler = func(node *html.Node) {
+		if node.Type == html.ElementNode && (node.Data == params.subTag || node.Data == params.tag) {
+			// Inspect every child rather than dereferencing FirstChild, which
+			// is nil for an empty element.
+			for child := node.FirstChild; child != nil; child = child.NextSibling {
+				if child.Type == html.TextNode && strings.Contains(child.Data, params.substring) {
+					values = append(values, child.Data)
+				}
+			}
+		}
+		for child := node.FirstChild; child != nil; child = child.NextSibling {
+			crawler(child)
+		}
+	}
+	crawler(doc)
+
+	return values
+}
+
+// getPrice returns the first amount that can be parsed from rawStr, taken
+// from the number inside parentheses with thousands separators removed. It
+// returns 0 when no string yields a valid number.
+func (s *MandarakeParser) getPrice(rawStr []string) int {
+	for _, str := range rawStr {
+		matches := priceInParensRe.FindStringSubmatch(str)
+		if len(matches) < 2 {
+			continue
+		}
+
+		value, err := strconv.Atoi(strings.ReplaceAll(matches[1], ",", ""))
+		if err != nil {
+			log.WithError(err).Warn("Mandarake parser | Error parsing price")
+			continue
+		}
+		return value
+	}
+	return 0
 }