price scrapper
This commit is contained in:
parent
d19f5f7621
commit
8922b8a4f0
7 changed files with 426 additions and 0 deletions
50
internal/scrapper/handleTasks.go
Normal file
50
internal/scrapper/handleTasks.go
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"github.com/chromedp/chromedp"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"scrapper-mandarake/internal/common"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *Scrapper) Start(ctx context.Context, tasksChan <-chan common.Task, resultsChan chan<- common.Result) error {
|
||||||
|
if s.goroutinesNumber <= 0 {
|
||||||
|
err := errors.New("gorutines num <= 0, abort")
|
||||||
|
log.WithError(err).Error(pkgLogHeader)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
poolCtx, cancel := context.WithCancel(ctx)
|
||||||
|
s.poolCancel = cancel
|
||||||
|
|
||||||
|
log.Infof("%v Start handling tasks", pkgLogHeader)
|
||||||
|
log.Infof("%v Setting up browser", pkgLogHeader)
|
||||||
|
cr, err := s.setupBrowser(poolCtx)
|
||||||
|
if err != nil {
|
||||||
|
log.WithError(err).Error(pkgLogHeader + logGetPrice + "failed to setup browser")
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
log.WithField("Copyright message", cr).Infof("%v Finished setting up browser.", pkgLogHeader)
|
||||||
|
|
||||||
|
allocCtx, allocCancel := chromedp.NewRemoteAllocator(poolCtx, s.externalBrowser)
|
||||||
|
s.allocCancel = allocCancel
|
||||||
|
|
||||||
|
log.Infof("%v processing tasks...", pkgLogHeader)
|
||||||
|
s.wg.Add(s.goroutinesNumber)
|
||||||
|
for i := 0; i < s.goroutinesNumber; i++ {
|
||||||
|
go s.worker(allocCtx, tasksChan, resultsChan)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scrapper) Stop() {
|
||||||
|
if s.allocCancel != nil {
|
||||||
|
s.allocCancel()
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.poolCancel != nil {
|
||||||
|
s.poolCancel()
|
||||||
|
}
|
||||||
|
s.wg.Wait()
|
||||||
|
}
|
||||||
40
internal/scrapper/handler.go
Normal file
40
internal/scrapper/handler.go
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Package-wide constants: scraper identity, pricing defaults and
// log-message prefixes.
const (
	// originName identifies this scraper in emitted results.
	originName = "mandarake"
	// zeroPrice is returned whenever a price cannot be determined.
	zeroPrice int32 = 0
	// taxMultiplier converts a tax-exclusive price to a tax-inclusive one
	// (presumably Japan's 10% consumption tax — confirm if the rate changes).
	taxMultiplier float64 = 1.1
	// pkgLogHeader prefixes every log line from this package.
	pkgLogHeader = "Scrapper |"
	// logWorker tags log lines emitted from the worker loop.
	logWorker = "worker:"
	// logTaskWarning tags task-level warnings.
	logTaskWarning = "task warning:"
	// logGetPrice tags log lines from the price-extraction path.
	logGetPrice = "get price:"
)
|
||||||
|
|
||||||
|
// Scrapper scrapes item prices from mandarake.co.jp through a remote Chrome
// instance, processing tasks with a pool of worker goroutines. It contains a
// sync.WaitGroup, so it must not be copied after first use.
type Scrapper struct {
	externalBrowser string // remote browser address handed to chromedp's allocator
	goroutinesNumber int // size of the worker pool started by Start
	taskTimeout time.Duration // per-task scraping timeout
	poolCancel context.CancelFunc // cancels the worker-pool context; set in Start, used by Stop
	allocCancel context.CancelFunc // cancels the chromedp allocator; set in Start, used by Stop
	wg sync.WaitGroup // tracks running workers; waited on in Stop
}
|
||||||
|
|
||||||
|
// Deps carries the configuration required to construct a Scrapper via New.
type Deps struct {
	// ExternalBrowser is the address of the remote Chrome instance.
	ExternalBrowser string
	// GoroutinesNumber is the worker-pool size; must be > 0 for Start to succeed.
	GoroutinesNumber int
	// TaskTimeout is the per-task timeout expressed in seconds.
	TaskTimeout int
}
|
||||||
|
|
||||||
|
// New builds a Scrapper from deps and returns it behind the PriceScrapper
// interface. deps.TaskTimeout is interpreted as a number of seconds.
func New(deps Deps) PriceScrapper {
	return &Scrapper{
		externalBrowser: deps.ExternalBrowser,
		goroutinesNumber: deps.GoroutinesNumber,
		// Convert the configured seconds into a time.Duration once, up front.
		taskTimeout: time.Second * time.Duration(deps.TaskTimeout),
	}
}
|
||||||
30
internal/scrapper/helper.go
Normal file
30
internal/scrapper/helper.go
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"github.com/chromedp/chromedp"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *Scrapper) setupBrowser(ctx context.Context) (string, error) {
|
||||||
|
allocCtx, allocCancel := chromedp.NewRemoteAllocator(ctx, s.externalBrowser)
|
||||||
|
defer allocCancel()
|
||||||
|
|
||||||
|
pageCtx, pageCancel := chromedp.NewContext(allocCtx, chromedp.WithLogf(func(string, ...any) {}))
|
||||||
|
defer pageCancel()
|
||||||
|
|
||||||
|
copyright := "No copyright div found."
|
||||||
|
|
||||||
|
if err := chromedp.Run(pageCtx,
|
||||||
|
chromedp.Navigate("https://www.mandarake.co.jp/"),
|
||||||
|
chromedp.WaitReady("body", chromedp.ByQuery),
|
||||||
|
chromedp.Text(`div.copyright`, ©right, chromedp.ByQuery, chromedp.AtLeast(0)),
|
||||||
|
chromedp.Navigate("https://www.mandarake.co.jp/index2.html"),
|
||||||
|
chromedp.WaitReady("body", chromedp.ByQuery),
|
||||||
|
); err != nil {
|
||||||
|
log.WithError(err).Error(pkgLogHeader + logGetPrice + "failed to get single price tag")
|
||||||
|
return copyright, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return copyright, nil
|
||||||
|
}
|
||||||
11
internal/scrapper/interface.go
Normal file
11
internal/scrapper/interface.go
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"scrapper-mandarake/internal/common"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PriceScrapper is the consumer-facing contract of the scraper: start a
// worker pool over a task stream, and stop it again.
type PriceScrapper interface {
	// Start launches the workers; see Scrapper.Start for error conditions.
	Start(ctx context.Context, tasksChan <-chan common.Task, resultsChan chan<- common.Result) error
	// Stop cancels the pool and blocks until all workers have exited.
	Stop()
}
|
||||||
167
internal/scrapper/service.go
Normal file
167
internal/scrapper/service.go
Normal file
|
|
@ -0,0 +1,167 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"github.com/chromedp/chromedp"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"regexp"
|
||||||
|
"scrapper-mandarake/internal/common"
|
||||||
|
"slices"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// getPrice navigates to task.Link, reads the texts of `div.price` and
// `div.price_range` (both optional via AtLeast(0)), converts them to
// tax-inclusive values and returns the smallest. On a chromedp error it
// returns zeroPrice and the error.
//
// NOTE(review): getMinimalPrice covers the same flow via JS evaluation and is
// the variant called from worker; confirm whether this one is still used.
func (s *Scrapper) getPrice(ctx context.Context, task common.Task) (int32, error) {
	var (
		singlePrice string
		rangedPrice string
		prices []int32
	)

	if err := chromedp.Run(ctx,
		chromedp.Navigate(task.Link),
		chromedp.WaitReady("body"),
		// AtLeast(0): missing price nodes leave the strings empty instead of erroring.
		chromedp.Text(`div.price`, &singlePrice, chromedp.ByQuery, chromedp.AtLeast(0)),
		chromedp.Text(`div.price_range`, &rangedPrice, chromedp.ByQuery, chromedp.AtLeast(0)),
	); err != nil {
		log.WithError(err).Error(pkgLogHeader + logGetPrice + "failed to get single price tag")
		return zeroPrice, err
	}
	// The single price is always appended (zeroPrice when extraction fails),
	// so prices is never empty when slices.Min runs below.
	singlePrice = strings.TrimSpace(singlePrice)
	prices = append(prices, s.getSinglePriceWithTax(singlePrice))

	// The ranged price is optional and only considered when present.
	rangedPrice = strings.TrimSpace(rangedPrice)
	if rangedPrice != "" {
		prices = append(prices, s.getMinimalPriceFromRangeWithTax(rangedPrice))
	}

	minimal := slices.Min(prices)
	log.Infof(pkgLogHeader+"uuid: %s, price: %d", task.MerchUuid, minimal)

	return minimal, nil
}
|
||||||
|
|
||||||
|
// getMinimalPrice navigates to task.Link, extracts the single and ranged
// price texts via in-page JS (empty string when the node is absent), and
// delegates parsing to processPrices. Any chromedp error yields zeroPrice;
// the error itself is intentionally swallowed here — processPrices/worker
// treat zeroPrice as "no price found".
func (s *Scrapper) getMinimalPrice(sessionCtx context.Context, task common.Task) int32 {
	var (
		singlePrice string
		rangedPrice string
	)

	if err := chromedp.Run(sessionCtx,
		chromedp.Navigate(task.Link),
		chromedp.WaitVisible("body", chromedp.ByQuery),
		// Optional-chaining JS returns "" instead of failing when the div is missing.
		chromedp.Evaluate(`(document.querySelector('div.price')?.innerText || '').trim()`, &singlePrice),
		chromedp.Evaluate(`(document.querySelector('div.price_range')?.innerText || '').trim()`, &rangedPrice),
	); err != nil {
		return zeroPrice
	}

	minimal := s.processPrices(singlePrice, rangedPrice)
	log.Infof(pkgLogHeader+"uuid: %s, price: %d", task.MerchUuid, minimal)
	return minimal
}
|
||||||
|
|
||||||
|
func (s *Scrapper) processPrices(singlePrice, rangedPrice string) int32 {
|
||||||
|
var prices []int32
|
||||||
|
|
||||||
|
//in case of any errors or no price return zeroPrice const
|
||||||
|
//if success add to prices slice
|
||||||
|
if singlePrice != "" {
|
||||||
|
singlePrice = strings.TrimSpace(singlePrice)
|
||||||
|
counted, err := s.parseSinglePrice(singlePrice)
|
||||||
|
if err != nil {
|
||||||
|
log.WithFields(log.Fields{
|
||||||
|
"err": err.Error(),
|
||||||
|
"singlePrice": singlePrice,
|
||||||
|
}).Error(pkgLogHeader + logGetPrice + "failed to parse single price, returning zero price")
|
||||||
|
return zeroPrice
|
||||||
|
}
|
||||||
|
prices = append(prices, counted)
|
||||||
|
} else {
|
||||||
|
log.Warn(pkgLogHeader + logGetPrice + "single price not found")
|
||||||
|
return zeroPrice
|
||||||
|
}
|
||||||
|
|
||||||
|
//optional, adds price only if no errors and has non zero value
|
||||||
|
if rangedPrice != "" {
|
||||||
|
rangedPrice = strings.TrimSpace(rangedPrice)
|
||||||
|
counted, err := s.parseRangedPrice(rangedPrice)
|
||||||
|
if err != nil {
|
||||||
|
log.WithFields(log.Fields{
|
||||||
|
"err": err.Error(),
|
||||||
|
"rangedPrice": rangedPrice,
|
||||||
|
}).Error(pkgLogHeader + logGetPrice + "failed to parse ranged price")
|
||||||
|
} else {
|
||||||
|
if counted > 0 {
|
||||||
|
prices = append(prices, counted)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return slices.Min(prices)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scrapper) getSinglePriceWithTax(rawPrice string) int32 {
|
||||||
|
re := regexp.MustCompile(`(\d+)\s*円`)
|
||||||
|
matches := re.FindStringSubmatch(rawPrice)
|
||||||
|
if len(matches) < 2 {
|
||||||
|
log.Error("Mandarake | Single price not found, returning zero price")
|
||||||
|
return zeroPrice
|
||||||
|
}
|
||||||
|
|
||||||
|
priceStr := matches[1]
|
||||||
|
price, err := strconv.Atoi(priceStr)
|
||||||
|
if err != nil {
|
||||||
|
log.Error("Mandarake | Failed to convert single price, returning zero price")
|
||||||
|
return zeroPrice
|
||||||
|
}
|
||||||
|
return int32(price)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scrapper) getMinimalPriceFromRangeWithTax(priceRange string) int32 {
|
||||||
|
re := regexp.MustCompile(`他([\d,]+)円`)
|
||||||
|
matches := re.FindStringSubmatch(priceRange)
|
||||||
|
if len(matches) < 2 {
|
||||||
|
log.Error("Price not found in range, returning zero price")
|
||||||
|
return zeroPrice
|
||||||
|
}
|
||||||
|
|
||||||
|
priceStr := strings.ReplaceAll(matches[1], ",", "")
|
||||||
|
price, err := strconv.Atoi(priceStr)
|
||||||
|
if err != nil {
|
||||||
|
log.Error("Failed to convert minimal price in range, returning zero price")
|
||||||
|
return zeroPrice
|
||||||
|
}
|
||||||
|
|
||||||
|
return int32(float64(price) * taxMultiplier)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scrapper) parseSinglePrice(rawPrice string) (int32, error) {
|
||||||
|
deCommaStr := strings.ReplaceAll(rawPrice, ",", "")
|
||||||
|
split := strings.Split(deCommaStr, "円")
|
||||||
|
finalPrice, err := s.countTax(split[0])
|
||||||
|
if err != nil {
|
||||||
|
return zeroPrice, err
|
||||||
|
}
|
||||||
|
return finalPrice, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scrapper) parseRangedPrice(rawPrice string) (int32, error) {
|
||||||
|
deCommaStr := strings.ReplaceAll(rawPrice, ",", "")
|
||||||
|
split := strings.Split(deCommaStr, "円")
|
||||||
|
rm1 := strings.ReplaceAll(split[0], "(", "")
|
||||||
|
rm2 := strings.ReplaceAll(rm1, "他", "")
|
||||||
|
|
||||||
|
finalPrice, err := s.countTax(rm2)
|
||||||
|
if err != nil {
|
||||||
|
return zeroPrice, err
|
||||||
|
}
|
||||||
|
return finalPrice, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scrapper) countTax(priceStr string) (int32, error) {
|
||||||
|
intPrice, err := strconv.Atoi(priceStr)
|
||||||
|
if err != nil {
|
||||||
|
return zeroPrice, err
|
||||||
|
}
|
||||||
|
return int32(float64(intPrice) * taxMultiplier), nil
|
||||||
|
}
|
||||||
76
internal/scrapper/service_test.go
Normal file
76
internal/scrapper/service_test.go
Normal file
|
|
@ -0,0 +1,76 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestParser_processPrices is a table test for Scrapper.processPrices:
// a valid single price is mandatory (otherwise 0), and a valid ranged price
// may lower the result. Expected values include the 10% tax multiplier.
func TestParser_processPrices(t *testing.T) {
	// fields mirrors the Scrapper configuration under test.
	// NOTE(review): baseCtx is never read by processPrices; it only keeps the
	// context import alive — confirm whether it can be dropped file-wide.
	type fields struct {
		baseCtx context.Context
		externalBrowser string
		goroutinesNumber int
	}
	// args are the raw scraped price strings fed to processPrices.
	type args struct {
		singlePrice string
		rangedPrice string
	}

	// processPrices does not depend on these fields; they are placeholders.
	var placeholderFields = fields{
		baseCtx: context.Background(),
		externalBrowser: "",
		goroutinesNumber: 10,
	}

	// Example inputs as seen on the site:
	//single := "18,000円 (税込 19,800円)"
	//ranged := "(他15,000円~16,000円もあります)"

	tests := []struct {
		name string
		fields fields
		args args
		want int32
	}{
		//Cases
		// Ranged minimum 15,000 * 1.1 = 16,500 undercuts the single price.
		{name: "Full success", fields: placeholderFields, args: args{
			singlePrice: "18,000円 (税込 19,800円)",
			rangedPrice: "(他15,000円~16,000円もあります)",
		}, want: 16500},

		// Single only: 18,000 * 1.1 = 19,800.
		{name: "Single price only success 1", fields: placeholderFields, args: args{
			singlePrice: "18,000円 (税込 19,800円)",
			rangedPrice: "",
		}, want: 19800},

		// Unparsable ranged price is ignored; single price wins.
		{name: "Single price only success 2", fields: placeholderFields, args: args{
			singlePrice: "18,000円 (税込 19,800円)",
			rangedPrice: "no numbers in this string",
		}, want: 19800},

		// No single price at all -> zeroPrice.
		{name: "zero single price success 1", fields: placeholderFields, args: args{
			singlePrice: "",
			rangedPrice: "",
		}, want: 0},

		// Unparsable single price -> zeroPrice.
		{name: "zero single price success 2", fields: placeholderFields, args: args{
			singlePrice: "no numbers in this string",
			rangedPrice: "",
		}, want: 0},

		// Unparsable single price short-circuits even when ranged is present.
		{name: "zero single price success 3", fields: placeholderFields, args: args{
			singlePrice: "no numbers in this string",
			rangedPrice: "no numbers in this string",
		}, want: 0},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			s := &Scrapper{
				externalBrowser: tt.fields.externalBrowser,
				goroutinesNumber: tt.fields.goroutinesNumber,
			}
			if got := s.processPrices(tt.args.singlePrice, tt.args.rangedPrice); got != tt.want {
				t.Errorf("processPrices() = %v, want %v", got, tt.want)
			}
		})
	}
}
|
||||||
52
internal/scrapper/worker.go
Normal file
52
internal/scrapper/worker.go
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
package scrapper
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"github.com/chromedp/chromedp"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"runtime/debug"
|
||||||
|
"scrapper-mandarake/internal/common"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *Scrapper) worker(ctx context.Context, tasksChan <-chan common.Task, resultsChan chan<- common.Result) {
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
log.Errorf("%v %v PANIC: %v\n%s", pkgLogHeader, logWorker, r, debug.Stack())
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case task, ok := <-tasksChan:
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
taskCtx, taskCancel := chromedp.NewContext(ctx /* chromedp.WithLogf(log.Printf) */, chromedp.WithLogf(func(string, ...any) {}))
|
||||||
|
timeoutCtx, timeoutCancel := context.WithTimeout(taskCtx, s.taskTimeout)
|
||||||
|
|
||||||
|
log.WithField("task_uuid", task.MerchUuid).Infof("%v %v processing task", pkgLogHeader, logWorker)
|
||||||
|
|
||||||
|
//price will be zeroPrice value in case of any error or if price not found
|
||||||
|
price := s.getMinimalPrice(timeoutCtx, task)
|
||||||
|
result := common.Result{
|
||||||
|
MerchUuid: task.MerchUuid,
|
||||||
|
OriginName: originName,
|
||||||
|
Price: price,
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case resultsChan <- result:
|
||||||
|
case <-ctx.Done():
|
||||||
|
timeoutCancel()
|
||||||
|
taskCancel()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
timeoutCancel()
|
||||||
|
taskCancel()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue