package parsers import ( "context" "fmt" "github.com/chromedp/cdproto/network" "github.com/chromedp/chromedp" log "github.com/sirupsen/logrus" "golang.org/x/net/html" "net/http" "net/http/cookiejar" "net/url" "regexp" "slices" "strconv" "strings" "sync" "task-processor/internal/appState" "task-processor/internal/shared" "time" ) type MandarakeParser struct { goroutinesNumber int parseParams parseParams client *http.Client } type parseParams struct { userAgent string cookieUrl string single price ranged price taxMult float64 } type price struct { tag string attrKey string attrVal string subTag string substring string } func NewMandarakeParser(goroutinesNumber int) *MandarakeParser { p := parseParams{ userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", cookieUrl: "https://www.mandarake.co.jp/", single: price{ tag: "div", attrKey: "class", attrVal: "price", subTag: "p", substring: "円", }, ranged: price{ tag: "div", attrKey: "class", attrVal: "price_range", subTag: "p", substring: "円"}, taxMult: 1.1, } return &MandarakeParser{ goroutinesNumber: goroutinesNumber, parseParams: p, } } func (s *MandarakeParser) HandleTasks(tasks []shared.Task, sender chan shared.TaskResult, state *appState.State) { log.Debug("Handling Mandarake tasks") if err := s.initClient2(); err != nil { log.WithError(err).Error("Mandarake handler | Error initializing client") return } receiver := make(chan shared.Task, len(tasks)) for _, task := range tasks { receiver <- task } close(receiver) wg := sync.WaitGroup{} for i := 0; i < s.goroutinesNumber; i++ { wg.Add(1) go func() { defer wg.Done() s.worker(receiver, sender, state) }() } wg.Wait() log.Debug("Finished handling Mandarake tasks") } func (s *MandarakeParser) worker(receiver chan shared.Task, sender chan shared.TaskResult, state *appState.State) { for task := range receiver { log.WithField("task id", task.MerchUuid).Debug("Mandarake worker | Processing task") page, err := s.getPage(task.Link) if err != nil { log.WithError(err).Error("Mandarake worker | Error getting page for task") continue } if page == nil { log.Debug("Mandarake worker | Page for task is nil") continue } p := int32(s.getMinPrice(page)) sender <- shared.TaskResult{ MerchUuid: task.MerchUuid, Origin: task.Origin, Price: p, } } } // Deprecated: use initClient2 instead. func (s *MandarakeParser) initClient() error { //preload cookies for client req, err := http.NewRequest("GET", s.parseParams.cookieUrl, nil) if err != nil { return err } //TODO сделать один клиент с одним джаром client := http.Client{} req.Header.Set("User-Agent", s.parseParams.userAgent) result, err := client.Do(req) if err != nil { return err } defer result.Body.Close() c := result.Cookies() log.WithField("cookies", c).Debug("Mandarake handler | Get cookies") //make client jar, err := cookiejar.New(nil) if err != nil { log.WithError(err).Error("Mandarake | Cookie jar") return err } u, err := url.Parse(s.parseParams.cookieUrl) if err != nil { log.WithError(err).Error("Mandarake | Parse cookie URL") return err } jar.SetCookies(u, c) taskClient := &http.Client{ Timeout: time.Second * 30, Jar: jar, } s.client = taskClient return nil } func (s *MandarakeParser) getPage(url string) (*html.Node, error) { req, err := http.NewRequest("GET", url, nil) if err != nil { return nil, err } req.Header.Set("User-Agent", s.parseParams.userAgent) result, err := s.client.Do(req) if err != nil { return nil, err } doc, err := html.Parse(result.Body) if err != nil { return nil, err } return doc, nil } func (s *MandarakeParser) getMinPrice(page *html.Node) int { singlePriceNode := s.findNode(page, s.parseParams.single) if singlePriceNode == nil { return 0 } singlePriceStr := s.findData(singlePriceNode, s.parseParams.single) if singlePriceStr == nil { return 0 } var prices []int prices = append(prices, s.getPrice(singlePriceStr)) priceRangeNode := s.findNode(page, s.parseParams.ranged) if priceRangeNode != nil { priceFromRange := s.findData(priceRangeNode, s.parseParams.ranged) if priceFromRange != nil { withTax := int(float64(s.getPrice(priceFromRange)) * s.parseParams.taxMult) prices = append(prices, withTax) } } return slices.Min(prices) } func (s *MandarakeParser) findNode(doc *html.Node, params price) *html.Node { if doc == nil { return nil } var ( crawler func(*html.Node) result *html.Node ) crawler = func(node *html.Node) { if result != nil { return } if node.Type == html.ElementNode && node.Data == params.tag { for _, attr := range node.Attr { if attr.Key == params.attrKey && attr.Val == params.attrVal { result = node return } } } for child := node.FirstChild; child != nil; child = child.NextSibling { crawler(child) } } crawler(doc) return result } func (s *MandarakeParser) findData(doc *html.Node, params price) []string { if doc == nil { return nil } var ( crawler func(*html.Node) values []string getText func(*html.Node) string ) getText = func(n *html.Node) string { if n.Type == html.TextNode { return n.Data } var result strings.Builder for c := n.FirstChild; c != nil; c = c.NextSibling { result.WriteString(getText(c)) } return result.String() } crawler = func(node *html.Node) { if node.Type == html.ElementNode && node.Data == params.subTag { text := strings.TrimSpace(getText(node)) if strings.Contains(text, params.substring) { values = append(values, text) } } for child := node.FirstChild; child != nil; child = child.NextSibling { crawler(child) } } crawler(doc) return values } func (s *MandarakeParser) getPrice(rawStr []string) int { re := regexp.MustCompile(`\([^)]*?([0-9,]+)[^)]*?\)`) for _, str := range rawStr { matches := re.FindStringSubmatch(str) if len(matches) > 1 { priceStr := strings.ReplaceAll(matches[1], ",", "") price, err := strconv.Atoi(priceStr) if err != nil { fmt.Println(err) return 0 } return price } } return 0 } //new client func (s *MandarakeParser) initClient2() error { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() ctx, _ = chromedp.NewContext(ctx) if err := chromedp.Run(ctx, chromedp.Navigate(s.parseParams.cookieUrl), chromedp.WaitVisible("body", chromedp.ByQuery), ); err != nil { return fmt.Errorf("failed to navigate: %w", err) } var cookies []*network.Cookie err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error { var err error cookies, err = network.GetCookies().Do(ctx) return err })) if err != nil { return fmt.Errorf("failed to get cookies: %w", err) } var httpCookies []*http.Cookie for _, c := range cookies { httpCookies = append(httpCookies, &http.Cookie{ Name: c.Name, Value: c.Value, Path: c.Path, Domain: c.Domain, Expires: float64ToTime(c.Expires), Secure: c.Secure, HttpOnly: c.HTTPOnly, SameSite: convertSameSite(c.SameSite), }) } jar, err := cookiejar.New(nil) if err != nil { return err } u, err := url.Parse(s.parseParams.cookieUrl) if err != nil { return err } jar.SetCookies(u, httpCookies) client := &http.Client{ Jar: jar, Timeout: 30 * time.Second, } s.client = client return nil } func convertSameSite(s network.CookieSameSite) http.SameSite { switch s { case network.CookieSameSiteStrict: return http.SameSiteStrictMode case network.CookieSameSiteLax: return http.SameSiteLaxMode case network.CookieSameSiteNone: return http.SameSiteNoneMode default: return http.SameSiteDefaultMode } } func float64ToTime(unixFloat float64) time.Time { sec := int64(unixFloat) nsec := int64((unixFloat - float64(sec)) * 1e9) return time.Unix(sec, nsec) }