From 8922b8a4f01ff6c236de1287b7f6b4377301fe81 Mon Sep 17 00:00:00 2001
From: nquidox
Date: Fri, 3 Apr 2026 20:54:10 +0300
Subject: [PATCH] price scrapper

---
 internal/scrapper/handleTasks.go  |  50 +++++++++
 internal/scrapper/handler.go      |  40 +++++++
 internal/scrapper/helper.go       |  30 ++++++
 internal/scrapper/interface.go    |  11 ++
 internal/scrapper/service.go      | 167 ++++++++++++++++++++++++++++++
 internal/scrapper/service_test.go |  76 ++++++++++++++
 internal/scrapper/worker.go       |  52 ++++++++++
 7 files changed, 426 insertions(+)
 create mode 100644 internal/scrapper/handleTasks.go
 create mode 100644 internal/scrapper/handler.go
 create mode 100644 internal/scrapper/helper.go
 create mode 100644 internal/scrapper/interface.go
 create mode 100644 internal/scrapper/service.go
 create mode 100644 internal/scrapper/service_test.go
 create mode 100644 internal/scrapper/worker.go

diff --git a/internal/scrapper/handleTasks.go b/internal/scrapper/handleTasks.go
new file mode 100644
index 0000000..ce31424
--- /dev/null
+++ b/internal/scrapper/handleTasks.go
@@ -0,0 +1,50 @@
+package scrapper
+
+import (
+	"context"
+	"errors"
+	"github.com/chromedp/chromedp"
+	log "github.com/sirupsen/logrus"
+	"scrapper-mandarake/internal/common"
+)
+
+// Start launches the worker pool. It validates the configured goroutine
+// count, warms up the remote browser once via setupBrowser, creates the
+// shared remote allocator, and then spawns goroutinesNumber workers that
+// consume tasksChan and publish to resultsChan.
+func (s *Scrapper) Start(ctx context.Context, tasksChan <-chan common.Task, resultsChan chan<- common.Result) error {
+	if s.goroutinesNumber <= 0 {
+		err := errors.New("goroutines num <= 0, abort")
+		log.WithError(err).Error(pkgLogHeader)
+		return err
+	}
+
+	poolCtx, cancel := context.WithCancel(ctx)
+	s.poolCancel = cancel
+
+	log.Infof("%v Start handling tasks", pkgLogHeader)
+	log.Infof("%v Setting up browser", pkgLogHeader)
+	cr, err := s.setupBrowser(poolCtx)
+	if err != nil {
+		log.WithError(err).Error(pkgLogHeader + logGetPrice + "failed to setup browser")
+		return err
+	}
+	log.WithField("Copyright message", cr).Infof("%v Finished setting up browser.", pkgLogHeader)
+
+	// This allocator is shared by all workers and lives until Stop().
+	allocCtx, allocCancel := chromedp.NewRemoteAllocator(poolCtx, s.externalBrowser)
+	s.allocCancel = allocCancel
+
+	log.Infof("%v processing tasks...", pkgLogHeader)
+	s.wg.Add(s.goroutinesNumber)
+	for i := 0; i < s.goroutinesNumber; i++ {
+		go s.worker(allocCtx, tasksChan, resultsChan)
+	}
+	return nil
+}
+
+// Stop cancels the allocator and pool contexts and blocks until every
+// worker has exited (each worker must call wg.Done when it returns).
+func (s *Scrapper) Stop() {
+	if s.allocCancel != nil {
+		s.allocCancel()
+	}
+
+	if s.poolCancel != nil {
+		s.poolCancel()
+	}
+	s.wg.Wait()
+}
diff --git a/internal/scrapper/handler.go b/internal/scrapper/handler.go
new file mode 100644
index 0000000..a0b7775
--- /dev/null
+++ b/internal/scrapper/handler.go
@@ -0,0 +1,40 @@
+package scrapper
+
+import (
+	"context"
+	"sync"
+	"time"
+)
+
+const (
+	originName               = "mandarake"
+	zeroPrice        int32   = 0
+	taxMultiplier    float64 = 1.1 // 10% Japanese consumption tax
+	pkgLogHeader             = "Scrapper |"
+	logWorker                = "worker:"
+	logTaskWarning           = "task warning:"
+	logGetPrice              = "get price:"
+)
+
+// Scrapper scrapes mandarake.co.jp prices through a remote chromedp
+// browser using a fixed-size worker pool.
+type Scrapper struct {
+	externalBrowser  string             // websocket URL of the remote browser
+	goroutinesNumber int                // size of the worker pool
+	taskTimeout      time.Duration      // per-task scraping deadline
+	poolCancel       context.CancelFunc // cancels the pool context created in Start
+	allocCancel      context.CancelFunc // cancels the shared remote allocator
+	wg               sync.WaitGroup     // tracks running workers; waited on in Stop
+}
+
+// Deps carries the configuration needed to construct a Scrapper.
+type Deps struct {
+	ExternalBrowser  string
+	GoroutinesNumber int
+	TaskTimeout      int // seconds
+}
+
+// New builds a PriceScrapper from the supplied dependencies.
+func New(deps Deps) PriceScrapper {
+	return &Scrapper{
+		externalBrowser:  deps.ExternalBrowser,
+		goroutinesNumber: deps.GoroutinesNumber,
+		taskTimeout:      time.Second * time.Duration(deps.TaskTimeout),
+	}
+}
diff --git a/internal/scrapper/helper.go b/internal/scrapper/helper.go
new file mode 100644
index 0000000..2098b84
--- /dev/null
+++ b/internal/scrapper/helper.go
@@ -0,0 +1,30 @@
+package scrapper
+
+import (
+	"context"
+	"github.com/chromedp/chromedp"
+	log "github.com/sirupsen/logrus"
+)
+
+// setupBrowser warms up the remote browser with a throw-away allocator
+// and tab: it navigates to the mandarake front pages and reads the
+// copyright div as a cheap liveness check. The returned string is the
+// copyright text (or a placeholder when the div is absent).
+func (s *Scrapper) setupBrowser(ctx context.Context) (string, error) {
+	allocCtx, allocCancel := chromedp.NewRemoteAllocator(ctx, s.externalBrowser)
+	defer allocCancel()
+
+	pageCtx, pageCancel := chromedp.NewContext(allocCtx, chromedp.WithLogf(func(string, ...any) {}))
+	defer pageCancel()
+
+	copyright := "No copyright div found."
+
+	if err := chromedp.Run(pageCtx,
+		chromedp.Navigate("https://www.mandarake.co.jp/"),
+		chromedp.WaitReady("body", chromedp.ByQuery),
+		chromedp.Text(`div.copyright`, &copyright, chromedp.ByQuery, chromedp.AtLeast(0)),
+		chromedp.Navigate("https://www.mandarake.co.jp/index2.html"),
+		chromedp.WaitReady("body", chromedp.ByQuery),
+	); err != nil {
+		log.WithError(err).Error(pkgLogHeader + logGetPrice + "failed to warm up browser session")
+		return copyright, err
+	}
+
+	return copyright, nil
+}
diff --git a/internal/scrapper/interface.go b/internal/scrapper/interface.go
new file mode 100644
index 0000000..5aaef6e
--- /dev/null
+++ b/internal/scrapper/interface.go
@@ -0,0 +1,11 @@
+package scrapper
+
+import (
+	"context"
+	"scrapper-mandarake/internal/common"
+)
+
+// PriceScrapper starts and stops a pool of price-scraping workers.
+type PriceScrapper interface {
+	Start(ctx context.Context, tasksChan <-chan common.Task, resultsChan chan<- common.Result) error
+	Stop()
+}
diff --git a/internal/scrapper/service.go b/internal/scrapper/service.go
new file mode 100644
index 0000000..c5a8000
--- /dev/null
+++ b/internal/scrapper/service.go
@@ -0,0 +1,167 @@
+package scrapper
+
+import (
+	"context"
+	"github.com/chromedp/chromedp"
+	log "github.com/sirupsen/logrus"
+	"regexp"
+	"scrapper-mandarake/internal/common"
+	"slices"
+	"strconv"
+	"strings"
+)
+
+// Price patterns are compiled once at package level rather than per call.
+// Amounts may contain thousands separators, e.g. "18,000円".
+var (
+	singlePriceRe = regexp.MustCompile(`([\d,]+)\s*円`)
+	rangedPriceRe = regexp.MustCompile(`他([\d,]+)円`)
+)
+
+// getPrice navigates to the task link, reads the single and (optional)
+// ranged price divs via CSS selectors and returns the minimum with tax.
+// NOTE(review): superseded by getMinimalPrice, which the workers use.
+func (s *Scrapper) getPrice(ctx context.Context, task common.Task) (int32, error) {
+	var (
+		singlePrice string
+		rangedPrice string
+		prices      []int32
+	)
+
+	if err := chromedp.Run(ctx,
+		chromedp.Navigate(task.Link),
+		chromedp.WaitReady("body"),
+		chromedp.Text(`div.price`, &singlePrice, chromedp.ByQuery, chromedp.AtLeast(0)),
+		chromedp.Text(`div.price_range`, &rangedPrice, chromedp.ByQuery, chromedp.AtLeast(0)),
+	); err != nil {
+		log.WithError(err).Error(pkgLogHeader + logGetPrice + "failed to get single price tag")
+		return zeroPrice, err
+	}
+	singlePrice = strings.TrimSpace(singlePrice)
+	prices = append(prices, s.getSinglePriceWithTax(singlePrice))
+
+	rangedPrice = strings.TrimSpace(rangedPrice)
+	if rangedPrice != "" {
+		prices = append(prices, s.getMinimalPriceFromRangeWithTax(rangedPrice))
+	}
+
+	minimal := slices.Min(prices)
+	log.Infof(pkgLogHeader+"uuid: %s, price: %d", task.MerchUuid, minimal)
+
+	return minimal, nil
+}
+
+// getMinimalPrice is the worker-facing entry point: it scrapes both price
+// divs with in-page JS (tolerating missing elements) and returns the
+// minimal price with tax, or zeroPrice on any error.
+func (s *Scrapper) getMinimalPrice(sessionCtx context.Context, task common.Task) int32 {
+	var (
+		singlePrice string
+		rangedPrice string
+	)
+
+	if err := chromedp.Run(sessionCtx,
+		chromedp.Navigate(task.Link),
+		chromedp.WaitVisible("body", chromedp.ByQuery),
+		chromedp.Evaluate(`(document.querySelector('div.price')?.innerText || '').trim()`, &singlePrice),
+		chromedp.Evaluate(`(document.querySelector('div.price_range')?.innerText || '').trim()`, &rangedPrice),
+	); err != nil {
+		// Do not swallow the error silently: log it before falling back
+		// to the zeroPrice sentinel the callers expect.
+		log.WithError(err).WithField("task_uuid", task.MerchUuid).
+			Error(pkgLogHeader + logGetPrice + "failed to scrape price divs")
+		return zeroPrice
+	}
+
+	minimal := s.processPrices(singlePrice, rangedPrice)
+	log.Infof(pkgLogHeader+"uuid: %s, price: %d", task.MerchUuid, minimal)
+	return minimal
+}
+
+// processPrices converts the raw price strings into taxed integer prices
+// and returns the minimum. The single price is mandatory: if it is empty
+// or unparsable, zeroPrice is returned. The ranged price is optional and
+// only considered when it parses to a positive value.
+func (s *Scrapper) processPrices(singlePrice, rangedPrice string) int32 {
+	var prices []int32
+
+	//in case of any errors or no price return zeroPrice const
+	//if success add to prices slice
+	if singlePrice != "" {
+		singlePrice = strings.TrimSpace(singlePrice)
+		counted, err := s.parseSinglePrice(singlePrice)
+		if err != nil {
+			log.WithFields(log.Fields{
+				"err":         err.Error(),
+				"singlePrice": singlePrice,
+			}).Error(pkgLogHeader + logGetPrice + "failed to parse single price, returning zero price")
+			return zeroPrice
+		}
+		prices = append(prices, counted)
+	} else {
+		log.Warn(pkgLogHeader + logGetPrice + "single price not found")
+		return zeroPrice
+	}
+
+	//optional, adds price only if no errors and has non zero value
+	if rangedPrice != "" {
+		rangedPrice = strings.TrimSpace(rangedPrice)
+		counted, err := s.parseRangedPrice(rangedPrice)
+		if err != nil {
+			log.WithFields(log.Fields{
+				"err":         err.Error(),
+				"rangedPrice": rangedPrice,
+			}).Error(pkgLogHeader + logGetPrice + "failed to parse ranged price")
+		} else {
+			if counted > 0 {
+				prices = append(prices, counted)
+			}
+		}
+	}
+	return slices.Min(prices)
+}
+
+// getSinglePriceWithTax extracts the first "…円" amount from rawPrice and
+// applies taxMultiplier. Returns zeroPrice when nothing parses.
+func (s *Scrapper) getSinglePriceWithTax(rawPrice string) int32 {
+	matches := singlePriceRe.FindStringSubmatch(rawPrice)
+	if len(matches) < 2 {
+		log.Error("Mandarake | Single price not found, returning zero price")
+		return zeroPrice
+	}
+
+	// Strip thousands separators before parsing ("18,000" -> "18000").
+	// The old `(\d+)\s*円` pattern matched only the digits after the last
+	// comma (e.g. "000" from "18,000円"), producing a wrong price.
+	priceStr := strings.ReplaceAll(matches[1], ",", "")
+	price, err := strconv.Atoi(priceStr)
+	if err != nil {
+		log.Error("Mandarake | Failed to convert single price, returning zero price")
+		return zeroPrice
+	}
+	// Bug fix: tax was previously not applied despite the function name
+	// (and despite getMinimalPriceFromRangeWithTax applying it).
+	return int32(float64(price) * taxMultiplier)
+}
+
+// getMinimalPriceFromRangeWithTax extracts the "他N円" amount from a price
+// range string and applies taxMultiplier. Returns zeroPrice on failure.
+func (s *Scrapper) getMinimalPriceFromRangeWithTax(priceRange string) int32 {
+	matches := rangedPriceRe.FindStringSubmatch(priceRange)
+	if len(matches) < 2 {
+		log.Error("Price not found in range, returning zero price")
+		return zeroPrice
+	}
+
+	priceStr := strings.ReplaceAll(matches[1], ",", "")
+	price, err := strconv.Atoi(priceStr)
+	if err != nil {
+		log.Error("Failed to convert minimal price in range, returning zero price")
+		return zeroPrice
+	}
+
+	return int32(float64(price) * taxMultiplier)
+}
+
+// parseSinglePrice parses strings like "18,000円 (税込 19,800円)": it
+// de-commas the text, takes everything before the first 円 and taxes it.
+func (s *Scrapper) parseSinglePrice(rawPrice string) (int32, error) {
+	deCommaStr := strings.ReplaceAll(rawPrice, ",", "")
+	split := strings.Split(deCommaStr, "円")
+	finalPrice, err := s.countTax(split[0])
+	if err != nil {
+		return zeroPrice, err
+	}
+	return finalPrice, nil
+}
+
+// parseRangedPrice parses strings like "(他15,000円~16,000円もあります)":
+// it de-commas the text, keeps what precedes the first 円, strips the
+// leading "(" and "他" markers, and taxes the remaining number.
+func (s *Scrapper) parseRangedPrice(rawPrice string) (int32, error) {
+	deCommaStr := strings.ReplaceAll(rawPrice, ",", "")
+	split := strings.Split(deCommaStr, "円")
+	rm1 := strings.ReplaceAll(split[0], "(", "")
+	rm2 := strings.ReplaceAll(rm1, "他", "")
+
+	finalPrice, err := s.countTax(rm2)
+	if err != nil {
+		return zeroPrice, err
+	}
+	return finalPrice, nil
+}
+
+// countTax converts a plain numeric string to an int32 price with
+// taxMultiplier applied; returns zeroPrice and the Atoi error on failure.
+func (s *Scrapper) countTax(priceStr string) (int32, error) {
+	intPrice, err := strconv.Atoi(priceStr)
+	if err != nil {
+		return zeroPrice, err
+	}
+	return int32(float64(intPrice) * taxMultiplier), nil
+}
diff --git
a/internal/scrapper/service_test.go b/internal/scrapper/service_test.go
new file mode 100644
index 0000000..117b35e
--- /dev/null
+++ b/internal/scrapper/service_test.go
@@ -0,0 +1,76 @@
+package scrapper
+
+import (
+	"context"
+	"testing"
+)
+
+// TestParser_processPrices checks the tax arithmetic and the mandatory
+// single / optional ranged price rules without touching a browser.
+func TestParser_processPrices(t *testing.T) {
+	type fields struct {
+		baseCtx          context.Context
+		externalBrowser  string
+		goroutinesNumber int
+	}
+	type args struct {
+		singlePrice string
+		rangedPrice string
+	}
+
+	var placeholderFields = fields{
+		baseCtx:          context.Background(),
+		externalBrowser:  "",
+		goroutinesNumber: 10,
+	}
+
+	//single := "18,000円 (税込 19,800円)"
+	//ranged := "(他15,000円~16,000円もあります)"
+
+	tests := []struct {
+		name   string
+		fields fields
+		args   args
+		want   int32
+	}{
+		//Cases
+		{name: "Full success", fields: placeholderFields, args: args{
+			singlePrice: "18,000円 (税込 19,800円)",
+			rangedPrice: "(他15,000円~16,000円もあります)",
+		}, want: 16500},
+
+		{name: "Single price only success 1", fields: placeholderFields, args: args{
+			singlePrice: "18,000円 (税込 19,800円)",
+			rangedPrice: "",
+		}, want: 19800},
+
+		{name: "Single price only success 2", fields: placeholderFields, args: args{
+			singlePrice: "18,000円 (税込 19,800円)",
+			rangedPrice: "no numbers in this string",
+		}, want: 19800},
+
+		{name: "zero single price success 1", fields: placeholderFields, args: args{
+			singlePrice: "",
+			rangedPrice: "",
+		}, want: 0},
+
+		{name: "zero single price success 2", fields: placeholderFields, args: args{
+			singlePrice: "no numbers in this string",
+			rangedPrice: "",
+		}, want: 0},
+
+		{name: "zero single price success 3", fields: placeholderFields, args: args{
+			singlePrice: "no numbers in this string",
+			rangedPrice: "no numbers in this string",
+		}, want: 0},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			s := &Scrapper{
+				externalBrowser:  tt.fields.externalBrowser,
+				goroutinesNumber: tt.fields.goroutinesNumber,
+			}
+			if got := s.processPrices(tt.args.singlePrice, tt.args.rangedPrice); got != tt.want {
+				t.Errorf("processPrices() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/internal/scrapper/worker.go b/internal/scrapper/worker.go
new file mode 100644
index 0000000..5cdf56a
--- /dev/null
+++ b/internal/scrapper/worker.go
@@ -0,0 +1,52 @@
+package scrapper
+
+import (
+	"context"
+	"github.com/chromedp/chromedp"
+	log "github.com/sirupsen/logrus"
+	"runtime/debug"
+	"scrapper-mandarake/internal/common"
+)
+
+// worker consumes tasks until the pool context is cancelled or tasksChan
+// is closed. Each task is scraped in its own timeout-bounded tab; any
+// failure yields a zeroPrice result rather than killing the worker.
+func (s *Scrapper) worker(ctx context.Context, tasksChan <-chan common.Task, resultsChan chan<- common.Result) {
+	// Bug fix: without this Done, Stop()'s wg.Wait() blocks forever
+	// because Start adds goroutinesNumber to the WaitGroup.
+	defer s.wg.Done()
+	defer func() {
+		if r := recover(); r != nil {
+			log.Errorf("%v %v PANIC: %v\n%s", pkgLogHeader, logWorker, r, debug.Stack())
+		}
+	}()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case task, ok := <-tasksChan:
+			if !ok {
+				return
+			}
+			s.handleTask(ctx, task, resultsChan)
+		}
+	}
+}
+
+// handleTask scrapes a single task in a fresh chromedp tab bounded by
+// taskTimeout and publishes the result unless the pool is shutting down.
+// Extracted from the worker loop so defer can release the per-task
+// contexts on every exit path instead of duplicated manual cancels.
+func (s *Scrapper) handleTask(ctx context.Context, task common.Task, resultsChan chan<- common.Result) {
+	taskCtx, taskCancel := chromedp.NewContext(ctx /* chromedp.WithLogf(log.Printf) */, chromedp.WithLogf(func(string, ...any) {}))
+	defer taskCancel()
+	timeoutCtx, timeoutCancel := context.WithTimeout(taskCtx, s.taskTimeout)
+	defer timeoutCancel()
+
+	log.WithField("task_uuid", task.MerchUuid).Infof("%v %v processing task", pkgLogHeader, logWorker)
+
+	//price will be zeroPrice value in case of any error or if price not found
+	price := s.getMinimalPrice(timeoutCtx, task)
+	result := common.Result{
+		MerchUuid:  task.MerchUuid,
+		OriginName: originName,
+		Price:      price,
+	}
+
+	select {
+	case resultsChan <- result:
+	case <-ctx.Done():
+	}
+}